1 //
   2 // Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
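     //
     // For example, the first definition below,
     //      reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
     // declares word (a) of xmm0 as Save-On-Call for both the register
     // allocator and the C calling convention, spills it as a float
     // (Op_RegF), gives it encoding 0, and maps it to the VM register
     // returned by xmm0->as_VMReg().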
  61 
  62 // XMM registers.  512-bit registers of 16 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words a and b hold a Double.
  64 // The whole registers are used in SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No registers are preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
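     //
     // Each lettered slot defined below is one 32-bit word of the
     // corresponding XMM register: a Float occupies word (a) only, a
     // Double occupies the pair (a)-(b), and the 64/128/256-bit vector
     // register classes further down use words (a)-(b), (a)-(d) and
     // (a)-(h) respectively.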
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // The flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
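     // reg_class_dynamic picks one of the two classes above at runtime:
     // float_reg resolves to float_reg_evex when VM_Version::supports_evex()
     // returns true and to float_reg_legacy otherwise.  The same pattern is
     // repeated for the double and vector register classes below.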
 732 
 733 // Class for pre evex double registers
 734 reg_class double_reg_legacy(XMM0,  XMM0b,
 735                      XMM1,  XMM1b,
 736                      XMM2,  XMM2b,
 737                      XMM3,  XMM3b,
 738                      XMM4,  XMM4b,
 739                      XMM5,  XMM5b,
 740                      XMM6,  XMM6b,
 741                      XMM7,  XMM7b
 742 #ifdef _LP64
 743                     ,XMM8,  XMM8b,
 744                      XMM9,  XMM9b,
 745                      XMM10, XMM10b,
 746                      XMM11, XMM11b,
 747                      XMM12, XMM12b,
 748                      XMM13, XMM13b,
 749                      XMM14, XMM14b,
 750                      XMM15, XMM15b
 751 #endif
 752                      );
 753 
 754 // Class for evex double registers
 755 reg_class double_reg_evex(XMM0,  XMM0b,
 756                      XMM1,  XMM1b,
 757                      XMM2,  XMM2b,
 758                      XMM3,  XMM3b,
 759                      XMM4,  XMM4b,
 760                      XMM5,  XMM5b,
 761                      XMM6,  XMM6b,
 762                      XMM7,  XMM7b
 763 #ifdef _LP64
 764                     ,XMM8,  XMM8b,
 765                      XMM9,  XMM9b,
 766                      XMM10, XMM10b,
 767                      XMM11, XMM11b,
 768                      XMM12, XMM12b,
 769                      XMM13, XMM13b,
 770                      XMM14, XMM14b,
 771                      XMM15, XMM15b,
 772                      XMM16, XMM16b,
 773                      XMM17, XMM17b,
 774                      XMM18, XMM18b,
 775                      XMM19, XMM19b,
 776                      XMM20, XMM20b,
 777                      XMM21, XMM21b,
 778                      XMM22, XMM22b,
 779                      XMM23, XMM23b,
 780                      XMM24, XMM24b,
 781                      XMM25, XMM25b,
 782                      XMM26, XMM26b,
 783                      XMM27, XMM27b,
 784                      XMM28, XMM28b,
 785                      XMM29, XMM29b,
 786                      XMM30, XMM30b,
 787                      XMM31, XMM31b
 788 #endif
 789                      );
 790 
 791 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 792 
 793 // Class for pre evex 32bit vector registers
 794 reg_class vectors_reg_legacy(XMM0,
 795                       XMM1,
 796                       XMM2,
 797                       XMM3,
 798                       XMM4,
 799                       XMM5,
 800                       XMM6,
 801                       XMM7
 802 #ifdef _LP64
 803                      ,XMM8,
 804                       XMM9,
 805                       XMM10,
 806                       XMM11,
 807                       XMM12,
 808                       XMM13,
 809                       XMM14,
 810                       XMM15
 811 #endif
 812                       );
 813 
 814 // Class for evex 32bit vector registers
 815 reg_class vectors_reg_evex(XMM0,
 816                       XMM1,
 817                       XMM2,
 818                       XMM3,
 819                       XMM4,
 820                       XMM5,
 821                       XMM6,
 822                       XMM7
 823 #ifdef _LP64
 824                      ,XMM8,
 825                       XMM9,
 826                       XMM10,
 827                       XMM11,
 828                       XMM12,
 829                       XMM13,
 830                       XMM14,
 831                       XMM15,
 832                       XMM16,
 833                       XMM17,
 834                       XMM18,
 835                       XMM19,
 836                       XMM20,
 837                       XMM21,
 838                       XMM22,
 839                       XMM23,
 840                       XMM24,
 841                       XMM25,
 842                       XMM26,
 843                       XMM27,
 844                       XMM28,
 845                       XMM29,
 846                       XMM30,
 847                       XMM31
 848 #endif
 849                       );
 850 
 851 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 852 
 853 // Class for pre evex 64bit vector registers
 854 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 855                       XMM1,  XMM1b,
 856                       XMM2,  XMM2b,
 857                       XMM3,  XMM3b,
 858                       XMM4,  XMM4b,
 859                       XMM5,  XMM5b,
 860                       XMM6,  XMM6b,
 861                       XMM7,  XMM7b
 862 #ifdef _LP64
 863                      ,XMM8,  XMM8b,
 864                       XMM9,  XMM9b,
 865                       XMM10, XMM10b,
 866                       XMM11, XMM11b,
 867                       XMM12, XMM12b,
 868                       XMM13, XMM13b,
 869                       XMM14, XMM14b,
 870                       XMM15, XMM15b
 871 #endif
 872                       );
 873 
 874 // Class for evex 64bit vector registers
 875 reg_class vectord_reg_evex(XMM0,  XMM0b,
 876                       XMM1,  XMM1b,
 877                       XMM2,  XMM2b,
 878                       XMM3,  XMM3b,
 879                       XMM4,  XMM4b,
 880                       XMM5,  XMM5b,
 881                       XMM6,  XMM6b,
 882                       XMM7,  XMM7b
 883 #ifdef _LP64
 884                      ,XMM8,  XMM8b,
 885                       XMM9,  XMM9b,
 886                       XMM10, XMM10b,
 887                       XMM11, XMM11b,
 888                       XMM12, XMM12b,
 889                       XMM13, XMM13b,
 890                       XMM14, XMM14b,
 891                       XMM15, XMM15b,
 892                       XMM16, XMM16b,
 893                       XMM17, XMM17b,
 894                       XMM18, XMM18b,
 895                       XMM19, XMM19b,
 896                       XMM20, XMM20b,
 897                       XMM21, XMM21b,
 898                       XMM22, XMM22b,
 899                       XMM23, XMM23b,
 900                       XMM24, XMM24b,
 901                       XMM25, XMM25b,
 902                       XMM26, XMM26b,
 903                       XMM27, XMM27b,
 904                       XMM28, XMM28b,
 905                       XMM29, XMM29b,
 906                       XMM30, XMM30b,
 907                       XMM31, XMM31b
 908 #endif
 909                       );
 910 
 911 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 912 
 913 // Class for pre evex 128bit vector registers
 914 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 915                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 916                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 917                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 918                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 919                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 920                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 921                       XMM7,  XMM7b,  XMM7c,  XMM7d
 922 #ifdef _LP64
 923                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 924                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 925                       XMM10, XMM10b, XMM10c, XMM10d,
 926                       XMM11, XMM11b, XMM11c, XMM11d,
 927                       XMM12, XMM12b, XMM12c, XMM12d,
 928                       XMM13, XMM13b, XMM13c, XMM13d,
 929                       XMM14, XMM14b, XMM14c, XMM14d,
 930                       XMM15, XMM15b, XMM15c, XMM15d
 931 #endif
 932                       );
 933 
 934 // Class for evex 128bit vector registers
 935 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 936                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 937                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 938                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 939                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 940                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 941                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 942                       XMM7,  XMM7b,  XMM7c,  XMM7d
 943 #ifdef _LP64
 944                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 945                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 946                       XMM10, XMM10b, XMM10c, XMM10d,
 947                       XMM11, XMM11b, XMM11c, XMM11d,
 948                       XMM12, XMM12b, XMM12c, XMM12d,
 949                       XMM13, XMM13b, XMM13c, XMM13d,
 950                       XMM14, XMM14b, XMM14c, XMM14d,
 951                       XMM15, XMM15b, XMM15c, XMM15d,
 952                       XMM16, XMM16b, XMM16c, XMM16d,
 953                       XMM17, XMM17b, XMM17c, XMM17d,
 954                       XMM18, XMM18b, XMM18c, XMM18d,
 955                       XMM19, XMM19b, XMM19c, XMM19d,
 956                       XMM20, XMM20b, XMM20c, XMM20d,
 957                       XMM21, XMM21b, XMM21c, XMM21d,
 958                       XMM22, XMM22b, XMM22c, XMM22d,
 959                       XMM23, XMM23b, XMM23c, XMM23d,
 960                       XMM24, XMM24b, XMM24c, XMM24d,
 961                       XMM25, XMM25b, XMM25c, XMM25d,
 962                       XMM26, XMM26b, XMM26c, XMM26d,
 963                       XMM27, XMM27b, XMM27c, XMM27d,
 964                       XMM28, XMM28b, XMM28c, XMM28d,
 965                       XMM29, XMM29b, XMM29c, XMM29d,
 966                       XMM30, XMM30b, XMM30c, XMM30d,
 967                       XMM31, XMM31b, XMM31c, XMM31d
 968 #endif
 969                       );
 970 
 971 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 972 
 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
1069 
1070 %}
1071 
1072 
1073 //----------SOURCE BLOCK-------------------------------------------------------
1074 // This is a block of C++ code which provides values, functions, and
1075 // definitions necessary in the rest of the architecture description
1076 
1077 source_hpp %{
1078 // Header information of the source block.
1079 // Method declarations/definitions which are used outside
1080 // the ad-scope can conveniently be defined here.
1081 //
1082 // To keep related declarations/definitions/uses close together,
1083 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1084 
1085 class NativeJump;
1086 
1087 class CallStubImpl {
1088 
1089   //--------------------------------------------------------------
1090   //---<  Used for optimization in Compile::shorten_branches  >---
1091   //--------------------------------------------------------------
1092 
1093  public:
1094   // Size of call trampoline stub.
1095   static uint size_call_trampoline() {
1096     return 0; // no call trampolines on this platform
1097   }
1098 
1099   // number of relocations needed by a call trampoline stub
1100   static uint reloc_call_trampoline() {
1101     return 0; // no call trampolines on this platform
1102   }
1103 };
1104 
1105 class HandlerImpl {
1106 
1107  public:
1108 
1109   static int emit_exception_handler(CodeBuffer &cbuf);
1110   static int emit_deopt_handler(CodeBuffer& cbuf);
1111 
1112   static uint size_exception_handler() {
1113     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1116     // Note that this value is also credited (in output.cpp) to
1117     // the size of the code section.
1118     return NativeJump::instruction_size;
1119   }
1120 
1121 #ifdef _LP64
1122   static uint size_deopt_handler() {
    // three 5-byte instructions
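    // (the call, the subptr adjusting [rsp], and the jmp emitted by
    //  emit_deopt_handler below)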
1124     return 15;
1125   }
1126 #else
1127   static uint size_deopt_handler() {
1128     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1131     // Note that this value is also credited (in output.cpp) to
1132     // the size of the code section.
1133     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1134   }
1135 #endif
1136 };
1137 
1138 %} // end source_hpp
1139 
1140 source %{
1141 
1142 #include "opto/addnode.hpp"
1143 
1144 // Emit exception handler code.
1145 // Stuff framesize into a register and call a VM stub routine.
1146 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1147 
1148   // Note that the code buffer's insts_mark is always relative to insts.
1149   // That's why we must use the macroassembler to generate a handler.
1150   MacroAssembler _masm(&cbuf);
1151   address base = __ start_a_stub(size_exception_handler());
1152   if (base == NULL) {
1153     ciEnv::current()->record_failure("CodeCache is full");
1154     return 0;  // CodeBuffer::expand failed
1155   }
1156   int offset = __ offset();
1157   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1158   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1159   __ end_a_stub();
1160   return offset;
1161 }
1162 
1163 // Emit deopt handler code.
1164 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1165 
1166   // Note that the code buffer's insts_mark is always relative to insts.
1167   // That's why we must use the macroassembler to generate a handler.
1168   MacroAssembler _masm(&cbuf);
1169   address base = __ start_a_stub(size_deopt_handler());
1170   if (base == NULL) {
1171     ciEnv::current()->record_failure("CodeCache is full");
1172     return 0;  // CodeBuffer::expand failed
1173   }
1174   int offset = __ offset();
1175 
1176 #ifdef _LP64
1177   address the_pc = (address) __ pc();
1178   Label next;
1179   // push a "the_pc" on the stack without destroying any registers
1180   // as they all may be live.
1181 
1182   // push address of "next"
1183   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1184   __ bind(next);
1185   // adjust it so it matches "the_pc"
1186   __ subptr(Address(rsp, 0), __ offset() - offset);
1187 #else
1188   InternalAddress here(__ pc());
1189   __ pushptr(here.addr());
1190 #endif
1191 
1192   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1193   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1194   __ end_a_stub();
1195   return offset;
1196 }
1197 
1198 
1199 //=============================================================================
1200 
1201   // Float masks come from different places depending on platform.
1202 #ifdef _LP64
1203   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1204   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1205   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1206   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1207 #else
1208   static address float_signmask()  { return (address)float_signmask_pool; }
1209   static address float_signflip()  { return (address)float_signflip_pool; }
1210   static address double_signmask() { return (address)double_signmask_pool; }
1211   static address double_signflip() { return (address)double_signflip_pool; }
1212 #endif
1213 
1214 
1215 const bool Matcher::match_rule_supported(int opcode) {
1216   if (!has_match_rule(opcode))
1217     return false;
1218 
1219   bool ret_value = true;
1220   switch (opcode) {
1221     case Op_PopCountI:
1222     case Op_PopCountL:
1223       if (!UsePopCountInstruction)
1224         ret_value = false;
1225       break;
1226     case Op_MulVI:
1227       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1228         ret_value = false;
1229       break;
1230     case Op_MulVL:
1231     case Op_MulReductionVL:
1232       if (VM_Version::supports_avx512dq() == false)
1233         ret_value = false;
1234       break;
1235     case Op_AddReductionVL:
1236       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1237         ret_value = false;
1238       break;
1239     case Op_AddReductionVI:
1240       if (UseSSE < 3) // requires at least SSE3
1241         ret_value = false;
1242       break;
1243     case Op_MulReductionVI:
1244       if (UseSSE < 4) // requires at least SSE4
1245         ret_value = false;
1246       break;
1247     case Op_AddReductionVF:
1248     case Op_AddReductionVD:
1249     case Op_MulReductionVF:
1250     case Op_MulReductionVD:
1251       if (UseSSE < 1) // requires at least SSE
1252         ret_value = false;
1253       break;
1254     case Op_SqrtVD:
1255       if (UseAVX < 1) // enabled for AVX only
1256         ret_value = false;
1257       break;
1258     case Op_CompareAndSwapL:
1259 #ifdef _LP64
1260     case Op_CompareAndSwapP:
1261 #endif
1262       if (!VM_Version::supports_cx8())
1263         ret_value = false;
1264       break;
1265     case Op_CMoveVD:
1266       if (UseAVX < 1 || UseAVX > 2)
1267         ret_value = false;
1268       break;
1269     case Op_StrIndexOf:
1270       if (!UseSSE42Intrinsics)
1271         ret_value = false;
1272       break;
1273     case Op_StrIndexOfChar:
1274       if (!UseSSE42Intrinsics)
1275         ret_value = false;
1276       break;
1277     case Op_OnSpinWait:
1278       if (VM_Version::supports_on_spin_wait() == false)
1279         ret_value = false;
1280       break;
1281   }
1282 
  return ret_value;  // By default, match rules are supported.
1284 }
1285 
1286 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1287   // identify extra cases that we might want to provide match rules for
1288   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
1289   bool ret_value = match_rule_supported(opcode);
1290   if (ret_value) {
1291     switch (opcode) {
1292       case Op_AddVB:
1293       case Op_SubVB:
1294         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1295           ret_value = false;
1296         break;
1297       case Op_URShiftVS:
1298       case Op_RShiftVS:
1299       case Op_LShiftVS:
1300       case Op_MulVS:
1301       case Op_AddVS:
1302       case Op_SubVS:
1303         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1304           ret_value = false;
1305         break;
1306       case Op_CMoveVD:
1307         if (vlen != 4)
1308           ret_value  = false;
1309         break;
1310     }
1311   }
1312 
  return ret_value;  // By default, match rules are supported.
1314 }
1315 
1316 const bool Matcher::has_predicated_vectors(void) {
1317   bool ret_value = false;
1318   if (UseAVX > 2) {
1319     ret_value = VM_Version::supports_avx512vl();
1320   }
1321 
1322   return ret_value;
1323 }
1324 
1325 const int Matcher::float_pressure(int default_pressure_threshold) {
1326   int float_pressure_threshold = default_pressure_threshold;
1327 #ifdef _LP64
1328   if (UseAVX > 2) {
1329     // Increase pressure threshold on machines with AVX3 which have
1330     // 2x more XMM registers.
1331     float_pressure_threshold = default_pressure_threshold * 2;
1332   }
1333 #endif
1334   return float_pressure_threshold;
1335 }
1336 
1337 // Max vector size in bytes. 0 if not supported.
1338 const int Matcher::vector_width_in_bytes(BasicType bt) {
1339   assert(is_java_primitive(bt), "only primitive type vectors");
1340   if (UseSSE < 2) return 0;
1341   // SSE2 supports 128bit vectors for all types.
1342   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
1344   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1345   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1346   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1347     size = (UseAVX > 2) ? 64 : 32;
1348   // Use flag to limit vector size.
1349   size = MIN2(size,(int)MaxVectorSize);
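  // For example, UseAVX == 2 yields (1 << 2) * 8 = 32 bytes (256-bit vectors) and
  // UseAVX == 3 yields 64 bytes (512-bit vectors), both capped by MaxVectorSize.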
1350   // Minimum 2 values in vector (or 4 for bytes).
1351   switch (bt) {
1352   case T_DOUBLE:
1353   case T_LONG:
1354     if (size < 16) return 0;
1355     break;
1356   case T_FLOAT:
1357   case T_INT:
1358     if (size < 8) return 0;
1359     break;
1360   case T_BOOLEAN:
1361     if (size < 4) return 0;
1362     break;
1363   case T_CHAR:
1364     if (size < 4) return 0;
1365     break;
1366   case T_BYTE:
1367     if (size < 4) return 0;
1368     break;
1369   case T_SHORT:
1370     if (size < 4) return 0;
1371     break;
1372   default:
1373     ShouldNotReachHere();
1374   }
1375   return size;
1376 }
1377 
1378 // Limits on vector size (number of elements) loaded into vector.
1379 const int Matcher::max_vector_size(const BasicType bt) {
1380   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1381 }
1382 const int Matcher::min_vector_size(const BasicType bt) {
1383   int max_size = max_vector_size(bt);
1384   // Min size which can be loaded into vector is 4 bytes.
1385   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
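  // For example, T_BYTE vectors need at least 4 elements, while T_SHORT and
  // wider types need only 2, further limited by the maximum size for that type.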
1386   return MIN2(size,max_size);
1387 }
1388 
// Vector ideal reg corresponding to specified size in bytes
const uint Matcher::vector_ideal_reg(int size) {
1391   switch(size) {
1392     case  4: return Op_VecS;
1393     case  8: return Op_VecD;
1394     case 16: return Op_VecX;
1395     case 32: return Op_VecY;
1396     case 64: return Op_VecZ;
1397   }
1398   ShouldNotReachHere();
1399   return 0;
1400 }
1401 
1402 // Only lowest bits of xmm reg are used for vector shift count.
1403 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1404   return Op_VecS;
1405 }
1406 
1407 // x86 supports misaligned vectors store/load.
1408 const bool Matcher::misaligned_vectors_ok() {
1409   return !AlignVector; // can be changed by flag
1410 }
1411 
1412 // x86 AES instructions are compatible with SunJCE expanded
1413 // keys, hence we do not need to pass the original key to stubs
1414 const bool Matcher::pass_original_key_for_aes() {
1415   return false;
1416 }
1417 
1418 
1419 const bool Matcher::convi2l_type_required = true;
1420 
1421 // Check for shift by small constant as well
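// For example, an (index << 2) that feeds only address computation can be folded
// into a scaled x86 addressing mode such as [base + index*4].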
1422 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1423   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1424       shift->in(2)->get_int() <= 3 &&
1425       // Are there other uses besides address expressions?
1426       !matcher->is_visited(shift)) {
1427     address_visited.set(shift->_idx); // Flag as address_visited
1428     mstack.push(shift->in(2), Matcher::Visit);
1429     Node *conv = shift->in(1);
1430 #ifdef _LP64
    // Allow the Matcher to match the rule that bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1434     if (conv->Opcode() == Op_ConvI2L &&
1435         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1436         // Are there other uses besides address expressions?
1437         !matcher->is_visited(conv)) {
1438       address_visited.set(conv->_idx); // Flag as address_visited
1439       mstack.push(conv->in(1), Matcher::Pre_Visit);
1440     } else
1441 #endif
1442       mstack.push(conv, Matcher::Pre_Visit);
1443     return true;
1444   }
1445   return false;
1446 }
1447 
1448 // Should the Matcher clone shifts on addressing modes, expecting them
1449 // to be subsumed into complex addressing expressions or compute them
1450 // into registers?
1451 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1452   Node *off = m->in(AddPNode::Offset);
1453   if (off->is_Con()) {
1454     address_visited.test_set(m->_idx); // Flag as address_visited
1455     Node *adr = m->in(AddPNode::Address);
1456 
1457     // Intel can handle 2 adds in addressing mode
1458     // AtomicAdd is not an addressing expression.
1459     // Cheap to find it by looking for screwy base.
1460     if (adr->is_AddP() &&
1461         !adr->in(AddPNode::Base)->is_top() &&
1462         // Are there other uses besides address expressions?
1463         !is_visited(adr)) {
1464       address_visited.set(adr->_idx); // Flag as address_visited
1465       Node *shift = adr->in(AddPNode::Offset);
1466       if (!clone_shift(shift, this, mstack, address_visited)) {
1467         mstack.push(shift, Pre_Visit);
1468       }
1469       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1470       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1471     } else {
1472       mstack.push(adr, Pre_Visit);
1473     }
1474 
1475     // Clone X+offset as it also folds into most addressing expressions
1476     mstack.push(off, Visit);
1477     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1478     return true;
1479   } else if (clone_shift(off, this, mstack, address_visited)) {
1480     address_visited.test_set(m->_idx); // Flag as address_visited
1481     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1482     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1483     return true;
1484   }
1485   return false;
1486 }
1487 
1488 void Compile::reshape_address(AddPNode* addp) {
1489 }
1490 
1491 // Helper methods for MachSpillCopyNode::implementation().
1492 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1493                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is very complex, so the size is obtained
  // by emitting the instructions into a scratch buffer instead.
1496   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1497   assert(ireg == Op_VecS || // 32bit vector
1498          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1499          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1500          "no non-adjacent vector moves" );
1501   if (cbuf) {
1502     MacroAssembler _masm(cbuf);
1503     int offset = __ offset();
1504     switch (ireg) {
1505     case Op_VecS: // copy whole register
1506     case Op_VecD:
1507     case Op_VecX:
1508       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1509       break;
1510     case Op_VecY:
1511       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1512       break;
1513     case Op_VecZ:
1514       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1515       break;
1516     default:
1517       ShouldNotReachHere();
1518     }
1519     int size = __ offset() - offset;
1520 #ifdef ASSERT
1521     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1523 #endif
1524     return size;
1525 #ifndef PRODUCT
1526   } else if (!do_size) {
1527     switch (ireg) {
1528     case Op_VecS:
1529     case Op_VecD:
1530     case Op_VecX:
1531       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1532       break;
1533     case Op_VecY:
1534     case Op_VecZ:
1535       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1536       break;
1537     default:
1538       ShouldNotReachHere();
1539     }
1540 #endif
1541   }
1542   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
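  // With EVEX (UseAVX > 2) the prefix is two bytes longer, hence 6 below.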
1543   return (UseAVX > 2) ? 6 : 4;
1544 }
1545 
1546 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1547                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is very complex, so the size is obtained
  // by emitting the instructions into a scratch buffer instead.
1550   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1551   if (cbuf) {
1552     MacroAssembler _masm(cbuf);
1553     int offset = __ offset();
1554     if (is_load) {
1555       switch (ireg) {
1556       case Op_VecS:
1557         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1558         break;
1559       case Op_VecD:
1560         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1561         break;
1562       case Op_VecX:
1563         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1564         break;
1565       case Op_VecY:
1566         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1567         break;
1568       case Op_VecZ:
1569         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1570         break;
1571       default:
1572         ShouldNotReachHere();
1573       }
1574     } else { // store
1575       switch (ireg) {
1576       case Op_VecS:
1577         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1578         break;
1579       case Op_VecD:
1580         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1581         break;
1582       case Op_VecX:
1583         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1584         break;
1585       case Op_VecY:
1586         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1587         break;
1588       case Op_VecZ:
1589         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1590         break;
1591       default:
1592         ShouldNotReachHere();
1593       }
1594     }
1595     int size = __ offset() - offset;
1596 #ifdef ASSERT
1597     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1598     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1600 #endif
1601     return size;
1602 #ifndef PRODUCT
1603   } else if (!do_size) {
1604     if (is_load) {
1605       switch (ireg) {
1606       case Op_VecS:
1607         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1608         break;
1609       case Op_VecD:
1610         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1611         break;
1612        case Op_VecX:
1613         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1614         break;
1615       case Op_VecY:
1616       case Op_VecZ:
1617         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1618         break;
1619       default:
1620         ShouldNotReachHere();
1621       }
1622     } else { // store
1623       switch (ireg) {
1624       case Op_VecS:
1625         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1626         break;
1627       case Op_VecD:
1628         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1629         break;
1630        case Op_VecX:
1631         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1632         break;
1633       case Op_VecY:
1634       case Op_VecZ:
1635         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1636         break;
1637       default:
1638         ShouldNotReachHere();
1639       }
1640     }
1641 #endif
1642   }
1643   bool is_single_byte = false;
1644   int vec_len = 0;
1645   if ((UseAVX > 2) && (stack_offset != 0)) {
1646     int tuple_type = Assembler::EVEX_FVM;
1647     int input_size = Assembler::EVEX_32bit;
1648     switch (ireg) {
1649     case Op_VecS:
1650       tuple_type = Assembler::EVEX_T1S;
1651       break;
1652     case Op_VecD:
1653       tuple_type = Assembler::EVEX_T1S;
1654       input_size = Assembler::EVEX_64bit;
1655       break;
1656     case Op_VecX:
1657       break;
1658     case Op_VecY:
1659       vec_len = 1;
1660       break;
1661     case Op_VecZ:
1662       vec_len = 2;
1663       break;
1664     }
1665     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1666   }
1667   int offset_size = 0;
1668   int size = 5;
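  // The base size of 5 appears to cover the prefix/opcode bytes plus ModRM and
  // a SIB byte for the rsp-relative address; the assert above checks this.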
1669   if (UseAVX > 2 ) {
1670     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1671       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1672       size += 2; // Need an additional two bytes for EVEX encoding
1673     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1674       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1675     } else {
1676       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1678     }
1679   } else {
1680     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1681   }
1682   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1683   return size+offset_size;
1684 }
1685 
1686 static inline jint replicate4_imm(int con, int width) {
1687   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
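  // For example, replicate4_imm(0x8F, 1) yields 0x8F8F8F8F and
  // replicate4_imm(0x1234, 2) yields 0x12341234.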
1688   assert(width == 1 || width == 2, "only byte or short types here");
1689   int bit_width = width * 8;
1690   jint val = con;
1691   val &= (1 << bit_width) - 1;  // mask off sign bits
1692   while(bit_width < 32) {
1693     val |= (val << bit_width);
1694     bit_width <<= 1;
1695   }
1696   return val;
1697 }
1698 
1699 static inline jlong replicate8_imm(int con, int width) {
1700   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
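  // For example, replicate8_imm(0x8F, 1) yields 0x8F8F8F8F8F8F8F8F.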
1701   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1702   int bit_width = width * 8;
1703   jlong val = con;
1704   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1705   while(bit_width < 64) {
1706     val |= (val << bit_width);
1707     bit_width <<= 1;
1708   }
1709   return val;
1710 }
1711 
1712 #ifndef PRODUCT
1713   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1714     st->print("nop \t# %d bytes pad for loops and calls", _count);
1715   }
1716 #endif
1717 
1718   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1719     MacroAssembler _masm(&cbuf);
1720     __ nop(_count);
1721   }
1722 
1723   uint MachNopNode::size(PhaseRegAlloc*) const {
1724     return _count;
1725   }
1726 
1727 #ifndef PRODUCT
1728   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1729     st->print("# breakpoint");
1730   }
1731 #endif
1732 
1733   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1734     MacroAssembler _masm(&cbuf);
1735     __ int3();
1736   }
1737 
1738   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1739     return MachNode::size(ra_);
1740   }
1741 
1742 %}
1743 
1744 encode %{
1745 
1746   enc_class call_epilog %{
1747     if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find the magic cookie on the stack
1749       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1750       MacroAssembler _masm(&cbuf);
1751       Label L;
1752       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1753       __ jccb(Assembler::equal, L);
1754       // Die if stack mismatch
1755       __ int3();
1756       __ bind(L);
1757     }
1758   %}
1759 
1760 %}
1761 
1762 
1763 //----------OPERANDS-----------------------------------------------------------
1764 // Operand definitions must precede instruction definitions for correct parsing
1765 // in the ADLC because operands constitute user defined types which are used in
1766 // instruction definitions.
1767 
// This operand applies only to EVEX, so there is only one version
1769 operand vecZ() %{
1770   constraint(ALLOC_IN_RC(vectorz_reg));
1771   match(VecZ);
1772 
1773   format %{ %}
1774   interface(REG_INTER);
1775 %}
1776 
1777 // Comparison Code for FP conditional move
1778 operand cmpOp_vcmppd() %{
1779   match(Bool);
1780 
1781   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
1782             n->as_Bool()->_test._test != BoolTest::no_overflow);
1783   format %{ "" %}
1784   interface(COND_INTER) %{
1785     equal        (0x0, "eq");
1786     less         (0x1, "lt");
1787     less_equal   (0x2, "le");
1788     not_equal    (0xC, "ne");
1789     greater_equal(0xD, "ge");
1790     greater      (0xE, "gt");
    //TODO adlc fails to compile without the next two lines, reporting:
1792     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
1793     // equal' for overflow.
1794     overflow     (0x20, "o");  // not really supported by the instruction
1795     no_overflow  (0x21, "no"); // not really supported by the instruction
1796   %}
1797 %}
1798 
1799 
1800 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1801 
1802 // ============================================================================
1803 
1804 instruct ShouldNotReachHere() %{
1805   match(Halt);
1806   format %{ "ud2\t# ShouldNotReachHere" %}
1807   ins_encode %{
1808     __ ud2();
1809   %}
1810   ins_pipe(pipe_slow);
1811 %}
1812 
1813 // =================================EVEX special===============================
1814 
1815 instruct setMask(rRegI dst, rRegI src) %{
1816   predicate(Matcher::has_predicated_vectors());
1817   match(Set dst (SetVectMaskI  src));
1818   effect(TEMP dst);
1819   format %{ "setvectmask   $dst, $src" %}
1820   ins_encode %{
1821     __ setvectmask($dst$$Register, $src$$Register);
1822   %}
1823   ins_pipe(pipe_slow);
1824 %}
1825 
1826 // ============================================================================
1827 
1828 instruct addF_reg(regF dst, regF src) %{
1829   predicate((UseSSE>=1) && (UseAVX == 0));
1830   match(Set dst (AddF dst src));
1831 
1832   format %{ "addss   $dst, $src" %}
1833   ins_cost(150);
1834   ins_encode %{
1835     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1836   %}
1837   ins_pipe(pipe_slow);
1838 %}
1839 
1840 instruct addF_mem(regF dst, memory src) %{
1841   predicate((UseSSE>=1) && (UseAVX == 0));
1842   match(Set dst (AddF dst (LoadF src)));
1843 
1844   format %{ "addss   $dst, $src" %}
1845   ins_cost(150);
1846   ins_encode %{
1847     __ addss($dst$$XMMRegister, $src$$Address);
1848   %}
1849   ins_pipe(pipe_slow);
1850 %}
1851 
1852 instruct addF_imm(regF dst, immF con) %{
1853   predicate((UseSSE>=1) && (UseAVX == 0));
1854   match(Set dst (AddF dst con));
1855   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1856   ins_cost(150);
1857   ins_encode %{
1858     __ addss($dst$$XMMRegister, $constantaddress($con));
1859   %}
1860   ins_pipe(pipe_slow);
1861 %}
1862 
1863 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1864   predicate(UseAVX > 0);
1865   match(Set dst (AddF src1 src2));
1866 
1867   format %{ "vaddss  $dst, $src1, $src2" %}
1868   ins_cost(150);
1869   ins_encode %{
1870     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1871   %}
1872   ins_pipe(pipe_slow);
1873 %}
1874 
1875 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1876   predicate(UseAVX > 0);
1877   match(Set dst (AddF src1 (LoadF src2)));
1878 
1879   format %{ "vaddss  $dst, $src1, $src2" %}
1880   ins_cost(150);
1881   ins_encode %{
1882     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1883   %}
1884   ins_pipe(pipe_slow);
1885 %}
1886 
1887 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1888   predicate(UseAVX > 0);
1889   match(Set dst (AddF src con));
1890 
1891   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1892   ins_cost(150);
1893   ins_encode %{
1894     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1895   %}
1896   ins_pipe(pipe_slow);
1897 %}
1898 
1899 instruct addD_reg(regD dst, regD src) %{
1900   predicate((UseSSE>=2) && (UseAVX == 0));
1901   match(Set dst (AddD dst src));
1902 
1903   format %{ "addsd   $dst, $src" %}
1904   ins_cost(150);
1905   ins_encode %{
1906     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1907   %}
1908   ins_pipe(pipe_slow);
1909 %}
1910 
1911 instruct addD_mem(regD dst, memory src) %{
1912   predicate((UseSSE>=2) && (UseAVX == 0));
1913   match(Set dst (AddD dst (LoadD src)));
1914 
1915   format %{ "addsd   $dst, $src" %}
1916   ins_cost(150);
1917   ins_encode %{
1918     __ addsd($dst$$XMMRegister, $src$$Address);
1919   %}
1920   ins_pipe(pipe_slow);
1921 %}
1922 
1923 instruct addD_imm(regD dst, immD con) %{
1924   predicate((UseSSE>=2) && (UseAVX == 0));
1925   match(Set dst (AddD dst con));
1926   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1927   ins_cost(150);
1928   ins_encode %{
1929     __ addsd($dst$$XMMRegister, $constantaddress($con));
1930   %}
1931   ins_pipe(pipe_slow);
1932 %}
1933 
1934 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1935   predicate(UseAVX > 0);
1936   match(Set dst (AddD src1 src2));
1937 
1938   format %{ "vaddsd  $dst, $src1, $src2" %}
1939   ins_cost(150);
1940   ins_encode %{
1941     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1942   %}
1943   ins_pipe(pipe_slow);
1944 %}
1945 
1946 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1947   predicate(UseAVX > 0);
1948   match(Set dst (AddD src1 (LoadD src2)));
1949 
1950   format %{ "vaddsd  $dst, $src1, $src2" %}
1951   ins_cost(150);
1952   ins_encode %{
1953     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1954   %}
1955   ins_pipe(pipe_slow);
1956 %}
1957 
1958 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1959   predicate(UseAVX > 0);
1960   match(Set dst (AddD src con));
1961 
1962   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1963   ins_cost(150);
1964   ins_encode %{
1965     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1966   %}
1967   ins_pipe(pipe_slow);
1968 %}
1969 
1970 instruct subF_reg(regF dst, regF src) %{
1971   predicate((UseSSE>=1) && (UseAVX == 0));
1972   match(Set dst (SubF dst src));
1973 
1974   format %{ "subss   $dst, $src" %}
1975   ins_cost(150);
1976   ins_encode %{
1977     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1978   %}
1979   ins_pipe(pipe_slow);
1980 %}
1981 
1982 instruct subF_mem(regF dst, memory src) %{
1983   predicate((UseSSE>=1) && (UseAVX == 0));
1984   match(Set dst (SubF dst (LoadF src)));
1985 
1986   format %{ "subss   $dst, $src" %}
1987   ins_cost(150);
1988   ins_encode %{
1989     __ subss($dst$$XMMRegister, $src$$Address);
1990   %}
1991   ins_pipe(pipe_slow);
1992 %}
1993 
1994 instruct subF_imm(regF dst, immF con) %{
1995   predicate((UseSSE>=1) && (UseAVX == 0));
1996   match(Set dst (SubF dst con));
1997   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1998   ins_cost(150);
1999   ins_encode %{
2000     __ subss($dst$$XMMRegister, $constantaddress($con));
2001   %}
2002   ins_pipe(pipe_slow);
2003 %}
2004 
2005 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2006   predicate(UseAVX > 0);
2007   match(Set dst (SubF src1 src2));
2008 
2009   format %{ "vsubss  $dst, $src1, $src2" %}
2010   ins_cost(150);
2011   ins_encode %{
2012     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2013   %}
2014   ins_pipe(pipe_slow);
2015 %}
2016 
2017 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2018   predicate(UseAVX > 0);
2019   match(Set dst (SubF src1 (LoadF src2)));
2020 
2021   format %{ "vsubss  $dst, $src1, $src2" %}
2022   ins_cost(150);
2023   ins_encode %{
2024     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2025   %}
2026   ins_pipe(pipe_slow);
2027 %}
2028 
2029 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2030   predicate(UseAVX > 0);
2031   match(Set dst (SubF src con));
2032 
2033   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2034   ins_cost(150);
2035   ins_encode %{
2036     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2037   %}
2038   ins_pipe(pipe_slow);
2039 %}
2040 
2041 instruct subD_reg(regD dst, regD src) %{
2042   predicate((UseSSE>=2) && (UseAVX == 0));
2043   match(Set dst (SubD dst src));
2044 
2045   format %{ "subsd   $dst, $src" %}
2046   ins_cost(150);
2047   ins_encode %{
2048     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2049   %}
2050   ins_pipe(pipe_slow);
2051 %}
2052 
2053 instruct subD_mem(regD dst, memory src) %{
2054   predicate((UseSSE>=2) && (UseAVX == 0));
2055   match(Set dst (SubD dst (LoadD src)));
2056 
2057   format %{ "subsd   $dst, $src" %}
2058   ins_cost(150);
2059   ins_encode %{
2060     __ subsd($dst$$XMMRegister, $src$$Address);
2061   %}
2062   ins_pipe(pipe_slow);
2063 %}
2064 
2065 instruct subD_imm(regD dst, immD con) %{
2066   predicate((UseSSE>=2) && (UseAVX == 0));
2067   match(Set dst (SubD dst con));
2068   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2069   ins_cost(150);
2070   ins_encode %{
2071     __ subsd($dst$$XMMRegister, $constantaddress($con));
2072   %}
2073   ins_pipe(pipe_slow);
2074 %}
2075 
2076 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2077   predicate(UseAVX > 0);
2078   match(Set dst (SubD src1 src2));
2079 
2080   format %{ "vsubsd  $dst, $src1, $src2" %}
2081   ins_cost(150);
2082   ins_encode %{
2083     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2084   %}
2085   ins_pipe(pipe_slow);
2086 %}
2087 
2088 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2089   predicate(UseAVX > 0);
2090   match(Set dst (SubD src1 (LoadD src2)));
2091 
2092   format %{ "vsubsd  $dst, $src1, $src2" %}
2093   ins_cost(150);
2094   ins_encode %{
2095     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2096   %}
2097   ins_pipe(pipe_slow);
2098 %}
2099 
2100 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2101   predicate(UseAVX > 0);
2102   match(Set dst (SubD src con));
2103 
2104   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2105   ins_cost(150);
2106   ins_encode %{
2107     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2108   %}
2109   ins_pipe(pipe_slow);
2110 %}
2111 
2112 instruct mulF_reg(regF dst, regF src) %{
2113   predicate((UseSSE>=1) && (UseAVX == 0));
2114   match(Set dst (MulF dst src));
2115 
2116   format %{ "mulss   $dst, $src" %}
2117   ins_cost(150);
2118   ins_encode %{
2119     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2120   %}
2121   ins_pipe(pipe_slow);
2122 %}
2123 
2124 instruct mulF_mem(regF dst, memory src) %{
2125   predicate((UseSSE>=1) && (UseAVX == 0));
2126   match(Set dst (MulF dst (LoadF src)));
2127 
2128   format %{ "mulss   $dst, $src" %}
2129   ins_cost(150);
2130   ins_encode %{
2131     __ mulss($dst$$XMMRegister, $src$$Address);
2132   %}
2133   ins_pipe(pipe_slow);
2134 %}
2135 
2136 instruct mulF_imm(regF dst, immF con) %{
2137   predicate((UseSSE>=1) && (UseAVX == 0));
2138   match(Set dst (MulF dst con));
2139   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2140   ins_cost(150);
2141   ins_encode %{
2142     __ mulss($dst$$XMMRegister, $constantaddress($con));
2143   %}
2144   ins_pipe(pipe_slow);
2145 %}
2146 
2147 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2148   predicate(UseAVX > 0);
2149   match(Set dst (MulF src1 src2));
2150 
2151   format %{ "vmulss  $dst, $src1, $src2" %}
2152   ins_cost(150);
2153   ins_encode %{
2154     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2155   %}
2156   ins_pipe(pipe_slow);
2157 %}
2158 
2159 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2160   predicate(UseAVX > 0);
2161   match(Set dst (MulF src1 (LoadF src2)));
2162 
2163   format %{ "vmulss  $dst, $src1, $src2" %}
2164   ins_cost(150);
2165   ins_encode %{
2166     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2167   %}
2168   ins_pipe(pipe_slow);
2169 %}
2170 
2171 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2172   predicate(UseAVX > 0);
2173   match(Set dst (MulF src con));
2174 
2175   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2176   ins_cost(150);
2177   ins_encode %{
2178     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2179   %}
2180   ins_pipe(pipe_slow);
2181 %}
2182 
2183 instruct mulD_reg(regD dst, regD src) %{
2184   predicate((UseSSE>=2) && (UseAVX == 0));
2185   match(Set dst (MulD dst src));
2186 
2187   format %{ "mulsd   $dst, $src" %}
2188   ins_cost(150);
2189   ins_encode %{
2190     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2191   %}
2192   ins_pipe(pipe_slow);
2193 %}
2194 
2195 instruct mulD_mem(regD dst, memory src) %{
2196   predicate((UseSSE>=2) && (UseAVX == 0));
2197   match(Set dst (MulD dst (LoadD src)));
2198 
2199   format %{ "mulsd   $dst, $src" %}
2200   ins_cost(150);
2201   ins_encode %{
2202     __ mulsd($dst$$XMMRegister, $src$$Address);
2203   %}
2204   ins_pipe(pipe_slow);
2205 %}
2206 
2207 instruct mulD_imm(regD dst, immD con) %{
2208   predicate((UseSSE>=2) && (UseAVX == 0));
2209   match(Set dst (MulD dst con));
2210   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2211   ins_cost(150);
2212   ins_encode %{
2213     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2214   %}
2215   ins_pipe(pipe_slow);
2216 %}
2217 
2218 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2219   predicate(UseAVX > 0);
2220   match(Set dst (MulD src1 src2));
2221 
2222   format %{ "vmulsd  $dst, $src1, $src2" %}
2223   ins_cost(150);
2224   ins_encode %{
2225     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2226   %}
2227   ins_pipe(pipe_slow);
2228 %}
2229 
2230 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2231   predicate(UseAVX > 0);
2232   match(Set dst (MulD src1 (LoadD src2)));
2233 
2234   format %{ "vmulsd  $dst, $src1, $src2" %}
2235   ins_cost(150);
2236   ins_encode %{
2237     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2238   %}
2239   ins_pipe(pipe_slow);
2240 %}
2241 
2242 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2243   predicate(UseAVX > 0);
2244   match(Set dst (MulD src con));
2245 
2246   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2247   ins_cost(150);
2248   ins_encode %{
2249     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2250   %}
2251   ins_pipe(pipe_slow);
2252 %}
2253 
2254 instruct divF_reg(regF dst, regF src) %{
2255   predicate((UseSSE>=1) && (UseAVX == 0));
2256   match(Set dst (DivF dst src));
2257 
2258   format %{ "divss   $dst, $src" %}
2259   ins_cost(150);
2260   ins_encode %{
2261     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2262   %}
2263   ins_pipe(pipe_slow);
2264 %}
2265 
2266 instruct divF_mem(regF dst, memory src) %{
2267   predicate((UseSSE>=1) && (UseAVX == 0));
2268   match(Set dst (DivF dst (LoadF src)));
2269 
2270   format %{ "divss   $dst, $src" %}
2271   ins_cost(150);
2272   ins_encode %{
2273     __ divss($dst$$XMMRegister, $src$$Address);
2274   %}
2275   ins_pipe(pipe_slow);
2276 %}
2277 
2278 instruct divF_imm(regF dst, immF con) %{
2279   predicate((UseSSE>=1) && (UseAVX == 0));
2280   match(Set dst (DivF dst con));
2281   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2282   ins_cost(150);
2283   ins_encode %{
2284     __ divss($dst$$XMMRegister, $constantaddress($con));
2285   %}
2286   ins_pipe(pipe_slow);
2287 %}
2288 
2289 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2290   predicate(UseAVX > 0);
2291   match(Set dst (DivF src1 src2));
2292 
2293   format %{ "vdivss  $dst, $src1, $src2" %}
2294   ins_cost(150);
2295   ins_encode %{
2296     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2297   %}
2298   ins_pipe(pipe_slow);
2299 %}
2300 
2301 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2302   predicate(UseAVX > 0);
2303   match(Set dst (DivF src1 (LoadF src2)));
2304 
2305   format %{ "vdivss  $dst, $src1, $src2" %}
2306   ins_cost(150);
2307   ins_encode %{
2308     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2309   %}
2310   ins_pipe(pipe_slow);
2311 %}
2312 
2313 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2314   predicate(UseAVX > 0);
2315   match(Set dst (DivF src con));
2316 
2317   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2318   ins_cost(150);
2319   ins_encode %{
2320     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2321   %}
2322   ins_pipe(pipe_slow);
2323 %}
2324 
2325 instruct divD_reg(regD dst, regD src) %{
2326   predicate((UseSSE>=2) && (UseAVX == 0));
2327   match(Set dst (DivD dst src));
2328 
2329   format %{ "divsd   $dst, $src" %}
2330   ins_cost(150);
2331   ins_encode %{
2332     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2333   %}
2334   ins_pipe(pipe_slow);
2335 %}
2336 
2337 instruct divD_mem(regD dst, memory src) %{
2338   predicate((UseSSE>=2) && (UseAVX == 0));
2339   match(Set dst (DivD dst (LoadD src)));
2340 
2341   format %{ "divsd   $dst, $src" %}
2342   ins_cost(150);
2343   ins_encode %{
2344     __ divsd($dst$$XMMRegister, $src$$Address);
2345   %}
2346   ins_pipe(pipe_slow);
2347 %}
2348 
2349 instruct divD_imm(regD dst, immD con) %{
2350   predicate((UseSSE>=2) && (UseAVX == 0));
2351   match(Set dst (DivD dst con));
2352   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2353   ins_cost(150);
2354   ins_encode %{
2355     __ divsd($dst$$XMMRegister, $constantaddress($con));
2356   %}
2357   ins_pipe(pipe_slow);
2358 %}
2359 
2360 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2361   predicate(UseAVX > 0);
2362   match(Set dst (DivD src1 src2));
2363 
2364   format %{ "vdivsd  $dst, $src1, $src2" %}
2365   ins_cost(150);
2366   ins_encode %{
2367     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2368   %}
2369   ins_pipe(pipe_slow);
2370 %}
2371 
2372 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2373   predicate(UseAVX > 0);
2374   match(Set dst (DivD src1 (LoadD src2)));
2375 
2376   format %{ "vdivsd  $dst, $src1, $src2" %}
2377   ins_cost(150);
2378   ins_encode %{
2379     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2380   %}
2381   ins_pipe(pipe_slow);
2382 %}
2383 
2384 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2385   predicate(UseAVX > 0);
2386   match(Set dst (DivD src con));
2387 
2388   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2389   ins_cost(150);
2390   ins_encode %{
2391     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2392   %}
2393   ins_pipe(pipe_slow);
2394 %}
2395 
2396 instruct absF_reg(regF dst) %{
2397   predicate((UseSSE>=1) && (UseAVX == 0));
2398   match(Set dst (AbsF dst));
2399   ins_cost(150);
2400   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2401   ins_encode %{
2402     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2403   %}
2404   ins_pipe(pipe_slow);
2405 %}
2406 
2407 instruct absF_reg_reg(regF dst, regF src) %{
2408   predicate(VM_Version::supports_avxonly());
2409   match(Set dst (AbsF src));
2410   ins_cost(150);
2411   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2412   ins_encode %{
2413     int vector_len = 0;
2414     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2415               ExternalAddress(float_signmask()), vector_len);
2416   %}
2417   ins_pipe(pipe_slow);
2418 %}
2419 
2420 #ifdef _LP64
2421 instruct absF_reg_reg_evex(regF dst, regF src) %{
2422   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2423   match(Set dst (AbsF src));
2424   ins_cost(150);
2425   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2426   ins_encode %{
2427     int vector_len = 0;
2428     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2429               ExternalAddress(float_signmask()), vector_len);
2430   %}
2431   ins_pipe(pipe_slow);
2432 %}
2433 
2434 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2435   predicate(VM_Version::supports_avx512novl());
2436   match(Set dst (AbsF src1));
2437   effect(TEMP src2);
2438   ins_cost(150);
2439   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2440   ins_encode %{
2441     int vector_len = 0;
2442     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2443               ExternalAddress(float_signmask()), vector_len);
2444   %}
2445   ins_pipe(pipe_slow);
2446 %}
2447 #else // _LP64
2448 instruct absF_reg_reg_evex(regF dst, regF src) %{
2449   predicate(UseAVX > 2);
2450   match(Set dst (AbsF src));
2451   ins_cost(150);
2452   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2453   ins_encode %{
2454     int vector_len = 0;
2455     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2456               ExternalAddress(float_signmask()), vector_len);
2457   %}
2458   ins_pipe(pipe_slow);
2459 %}
2460 #endif
2461 
2462 instruct absD_reg(regD dst) %{
2463   predicate((UseSSE>=2) && (UseAVX == 0));
2464   match(Set dst (AbsD dst));
2465   ins_cost(150);
2466   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2467             "# abs double by sign masking" %}
2468   ins_encode %{
2469     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2470   %}
2471   ins_pipe(pipe_slow);
2472 %}
2473 
2474 instruct absD_reg_reg(regD dst, regD src) %{
2475   predicate(VM_Version::supports_avxonly());
2476   match(Set dst (AbsD src));
2477   ins_cost(150);
2478   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2479             "# abs double by sign masking" %}
2480   ins_encode %{
2481     int vector_len = 0;
2482     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2483               ExternalAddress(double_signmask()), vector_len);
2484   %}
2485   ins_pipe(pipe_slow);
2486 %}
2487 
2488 #ifdef _LP64
2489 instruct absD_reg_reg_evex(regD dst, regD src) %{
2490   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2491   match(Set dst (AbsD src));
2492   ins_cost(150);
2493   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2494             "# abs double by sign masking" %}
2495   ins_encode %{
2496     int vector_len = 0;
2497     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2498               ExternalAddress(double_signmask()), vector_len);
2499   %}
2500   ins_pipe(pipe_slow);
2501 %}
2502 
2503 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2504   predicate(VM_Version::supports_avx512novl());
2505   match(Set dst (AbsD src1));
2506   effect(TEMP src2);
2507   ins_cost(150);
2508   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs double by sign masking" %}
2509   ins_encode %{
2510     int vector_len = 0;
2511     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2512               ExternalAddress(double_signmask()), vector_len);
2513   %}
2514   ins_pipe(pipe_slow);
2515 %}
2516 #else // _LP64
2517 instruct absD_reg_reg_evex(regD dst, regD src) %{
2518   predicate(UseAVX > 2);
2519   match(Set dst (AbsD src));
2520   ins_cost(150);
2521   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2522             "# abs double by sign masking" %}
2523   ins_encode %{
2524     int vector_len = 0;
2525     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2526               ExternalAddress(double_signmask()), vector_len);
2527   %}
2528   ins_pipe(pipe_slow);
2529 %}
2530 #endif
2531 
2532 instruct negF_reg(regF dst) %{
2533   predicate((UseSSE>=1) && (UseAVX == 0));
2534   match(Set dst (NegF dst));
2535   ins_cost(150);
2536   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2537   ins_encode %{
2538     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2539   %}
2540   ins_pipe(pipe_slow);
2541 %}
2542 
2543 instruct negF_reg_reg(regF dst, regF src) %{
2544   predicate(UseAVX > 0);
2545   match(Set dst (NegF src));
2546   ins_cost(150);
2547   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2548   ins_encode %{
2549     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2550                  ExternalAddress(float_signflip()));
2551   %}
2552   ins_pipe(pipe_slow);
2553 %}
2554 
2555 instruct negD_reg(regD dst) %{
2556   predicate((UseSSE>=2) && (UseAVX == 0));
2557   match(Set dst (NegD dst));
2558   ins_cost(150);
2559   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2560             "# neg double by sign flipping" %}
2561   ins_encode %{
2562     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2563   %}
2564   ins_pipe(pipe_slow);
2565 %}
2566 
2567 instruct negD_reg_reg(regD dst, regD src) %{
2568   predicate(UseAVX > 0);
2569   match(Set dst (NegD src));
2570   ins_cost(150);
2571   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2572             "# neg double by sign flipping" %}
2573   ins_encode %{
2574     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2575                  ExternalAddress(double_signflip()));
2576   %}
2577   ins_pipe(pipe_slow);
2578 %}
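
     // Note on the Abs*/Neg* rules above: they rely on the IEEE-754 bit layout.
     // ANDing out the sign bit (mask 0x7fffffff / 0x7fffffffffffffff) yields |x|,
     // while XORing the sign bit (mask 0x80000000 / 0x8000000000000000) yields -x;
     // no exponent or mantissa bits are touched, so NaNs and infinities keep
     // their payloads.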
2579 
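     // The ConvD2F(SqrtD(ConvF2D ...)) shapes matched below fold the common Java
     // idiom  float r = (float) Math.sqrt((double) f);  into a single
     // single-precision sqrtss, avoiding the float<->double round trip.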
2580 instruct sqrtF_reg(regF dst, regF src) %{
2581   predicate(UseSSE>=1);
2582   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
2583 
2584   format %{ "sqrtss  $dst, $src" %}
2585   ins_cost(150);
2586   ins_encode %{
2587     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2588   %}
2589   ins_pipe(pipe_slow);
2590 %}
2591 
2592 instruct sqrtF_mem(regF dst, memory src) %{
2593   predicate(UseSSE>=1);
2594   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
2595 
2596   format %{ "sqrtss  $dst, $src" %}
2597   ins_cost(150);
2598   ins_encode %{
2599     __ sqrtss($dst$$XMMRegister, $src$$Address);
2600   %}
2601   ins_pipe(pipe_slow);
2602 %}
2603 
2604 instruct sqrtF_imm(regF dst, immF con) %{
2605   predicate(UseSSE>=1);
2606   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
2607   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2608   ins_cost(150);
2609   ins_encode %{
2610     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2611   %}
2612   ins_pipe(pipe_slow);
2613 %}
2614 
2615 instruct sqrtD_reg(regD dst, regD src) %{
2616   predicate(UseSSE>=2);
2617   match(Set dst (SqrtD src));
2618 
2619   format %{ "sqrtsd  $dst, $src" %}
2620   ins_cost(150);
2621   ins_encode %{
2622     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2623   %}
2624   ins_pipe(pipe_slow);
2625 %}
2626 
2627 instruct sqrtD_mem(regD dst, memory src) %{
2628   predicate(UseSSE>=2);
2629   match(Set dst (SqrtD (LoadD src)));
2630 
2631   format %{ "sqrtsd  $dst, $src" %}
2632   ins_cost(150);
2633   ins_encode %{
2634     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2635   %}
2636   ins_pipe(pipe_slow);
2637 %}
2638 
2639 instruct sqrtD_imm(regD dst, immD con) %{
2640   predicate(UseSSE>=2);
2641   match(Set dst (SqrtD con));
2642   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2643   ins_cost(150);
2644   ins_encode %{
2645     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2646   %}
2647   ins_pipe(pipe_slow);
2648 %}
2649 
2650 instruct onspinwait() %{
2651   match(OnSpinWait);
2652   ins_cost(200);
2653 
2654   format %{
2655     $$template
2656     if (os::is_MP()) {
2657       $$emit$$"pause\t! membar_onspinwait"
2658     } else {
2659       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
2660     }
2661   %}
2662   ins_encode %{
2663     __ pause();
2664   %}
2665   ins_pipe(pipe_slow);
2666 %}
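
     // OnSpinWait is the ideal node produced for the java.lang.Thread.onSpinWait()
     // intrinsic; the rule above lowers it to the x86 PAUSE hint, which eases
     // memory-order speculation and reduces power use inside spin-wait loops.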
2667 
2668 // a * b + c
2669 instruct fmaD_reg(regD a, regD b, regD c) %{
2670   predicate(UseFMA);
2671   match(Set c (FmaD  c (Binary a b)));
2672   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2673   ins_cost(150);
2674   ins_encode %{
2675     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2676   %}
2677   ins_pipe( pipe_slow );
2678 %}
2679 
2680 // a * b + c
2681 instruct fmaF_reg(regF a, regF b, regF c) %{
2682   predicate(UseFMA);
2683   match(Set c (FmaF  c (Binary a b)));
2684   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2685   ins_cost(150);
2686   ins_encode %{
2687     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2688   %}
2689   ins_pipe( pipe_slow );
2690 %}
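
     // A minimal Java-level sketch of what reaches these two rules (assuming the
     // Math.fma intrinsics are enabled and UseFMA is set for this CPU):
     //
     //   double d = Math.fma(a, b, c);   // FmaD node, matched by fmaD_reg above
     //   float  f = Math.fma(x, y, z);   // FmaF node, matched by fmaF_reg above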
2691 
2692 // ====================VECTOR INSTRUCTIONS=====================================
2693 
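     // Notes on the vector rules below (a summary of the conventions they follow):
     //  - vecS/vecD/vecX/vecY/vecZ operands hold 4/8/16/32/64-byte vectors.
     //  - The vector_len argument passed to the macro assembler selects the encoded
     //    width: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit (EVEX).
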
2694 // Load vectors (4 bytes long)
2695 instruct loadV4(vecS dst, memory mem) %{
2696   predicate(n->as_LoadVector()->memory_size() == 4);
2697   match(Set dst (LoadVector mem));
2698   ins_cost(125);
2699   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2700   ins_encode %{
2701     __ movdl($dst$$XMMRegister, $mem$$Address);
2702   %}
2703   ins_pipe( pipe_slow );
2704 %}
2705 
2706 // Load vectors (8 bytes long)
2707 instruct loadV8(vecD dst, memory mem) %{
2708   predicate(n->as_LoadVector()->memory_size() == 8);
2709   match(Set dst (LoadVector mem));
2710   ins_cost(125);
2711   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2712   ins_encode %{
2713     __ movq($dst$$XMMRegister, $mem$$Address);
2714   %}
2715   ins_pipe( pipe_slow );
2716 %}
2717 
2718 // Load vectors (16 bytes long)
2719 instruct loadV16(vecX dst, memory mem) %{
2720   predicate(n->as_LoadVector()->memory_size() == 16);
2721   match(Set dst (LoadVector mem));
2722   ins_cost(125);
2723   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2724   ins_encode %{
2725     __ movdqu($dst$$XMMRegister, $mem$$Address);
2726   %}
2727   ins_pipe( pipe_slow );
2728 %}
2729 
2730 // Load vectors (32 bytes long)
2731 instruct loadV32(vecY dst, memory mem) %{
2732   predicate(n->as_LoadVector()->memory_size() == 32);
2733   match(Set dst (LoadVector mem));
2734   ins_cost(125);
2735   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2736   ins_encode %{
2737     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2738   %}
2739   ins_pipe( pipe_slow );
2740 %}
2741 
2742 // Load vectors (64 bytes long)
2743 instruct loadV64_dword(vecZ dst, memory mem) %{
2744   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2745   match(Set dst (LoadVector mem));
2746   ins_cost(125);
2747   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
2748   ins_encode %{
2749     int vector_len = 2;
2750     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
2751   %}
2752   ins_pipe( pipe_slow );
2753 %}
2754 
2755 // Load vectors (64 bytes long)
2756 instruct loadV64_qword(vecZ dst, memory mem) %{
2757   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
2758   match(Set dst (LoadVector mem));
2759   ins_cost(125);
2760   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
2761   ins_encode %{
2762     int vector_len = 2;
2763     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
2764   %}
2765   ins_pipe( pipe_slow );
2766 %}
2767 
2768 // Store vectors
2769 instruct storeV4(memory mem, vecS src) %{
2770   predicate(n->as_StoreVector()->memory_size() == 4);
2771   match(Set mem (StoreVector mem src));
2772   ins_cost(145);
2773   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
2774   ins_encode %{
2775     __ movdl($mem$$Address, $src$$XMMRegister);
2776   %}
2777   ins_pipe( pipe_slow );
2778 %}
2779 
2780 instruct storeV8(memory mem, vecD src) %{
2781   predicate(n->as_StoreVector()->memory_size() == 8);
2782   match(Set mem (StoreVector mem src));
2783   ins_cost(145);
2784   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
2785   ins_encode %{
2786     __ movq($mem$$Address, $src$$XMMRegister);
2787   %}
2788   ins_pipe( pipe_slow );
2789 %}
2790 
2791 instruct storeV16(memory mem, vecX src) %{
2792   predicate(n->as_StoreVector()->memory_size() == 16);
2793   match(Set mem (StoreVector mem src));
2794   ins_cost(145);
2795   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
2796   ins_encode %{
2797     __ movdqu($mem$$Address, $src$$XMMRegister);
2798   %}
2799   ins_pipe( pipe_slow );
2800 %}
2801 
2802 instruct storeV32(memory mem, vecY src) %{
2803   predicate(n->as_StoreVector()->memory_size() == 32);
2804   match(Set mem (StoreVector mem src));
2805   ins_cost(145);
2806   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2807   ins_encode %{
2808     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2809   %}
2810   ins_pipe( pipe_slow );
2811 %}
2812 
2813 instruct storeV64_dword(memory mem, vecZ src) %{
2814   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
2815   match(Set mem (StoreVector mem src));
2816   ins_cost(145);
2817   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
2818   ins_encode %{
2819     int vector_len = 2;
2820     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
2821   %}
2822   ins_pipe( pipe_slow );
2823 %}
2824 
2825 instruct storeV64_qword(memory mem, vecZ src) %{
2826   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
2827   match(Set mem (StoreVector mem src));
2828   ins_cost(145);
2829   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
2830   ins_encode %{
2831     int vector_len = 2;
2832     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
2833   %}
2834   ins_pipe( pipe_slow );
2835 %}
2836 
2837 // ====================LEGACY REPLICATE=======================================
2838 
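     // The replicate rules in this block are the pre-AVX-512 forms: their predicates
     // exclude avx512vl/avx512bw targets, which are instead handled by the EVEX
     // replicate rules further down via vpbroadcast*.
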
2839 instruct Repl4B_mem(vecS dst, memory mem) %{
2840   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2841   match(Set dst (ReplicateB (LoadB mem)));
2842   format %{ "punpcklbw $dst,$mem\n\t"
2843             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
2844   ins_encode %{
2845     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2846     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2847   %}
2848   ins_pipe( pipe_slow );
2849 %}
2850 
2851 instruct Repl8B_mem(vecD dst, memory mem) %{
2852   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2853   match(Set dst (ReplicateB (LoadB mem)));
2854   format %{ "punpcklbw $dst,$mem\n\t"
2855             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
2856   ins_encode %{
2857     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2858     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2859   %}
2860   ins_pipe( pipe_slow );
2861 %}
2862 
2863 instruct Repl16B(vecX dst, rRegI src) %{
2864   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2865   match(Set dst (ReplicateB src));
2866   format %{ "movd    $dst,$src\n\t"
2867             "punpcklbw $dst,$dst\n\t"
2868             "pshuflw $dst,$dst,0x00\n\t"
2869             "punpcklqdq $dst,$dst\t! replicate16B" %}
2870   ins_encode %{
2871     __ movdl($dst$$XMMRegister, $src$$Register);
2872     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2873     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2874     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2875   %}
2876   ins_pipe( pipe_slow );
2877 %}
2878 
2879 instruct Repl16B_mem(vecX dst, memory mem) %{
2880   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2881   match(Set dst (ReplicateB (LoadB mem)));
2882   format %{ "punpcklbw $dst,$mem\n\t"
2883             "pshuflw $dst,$dst,0x00\n\t"
2884             "punpcklqdq $dst,$dst\t! replicate16B" %}
2885   ins_encode %{
2886     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2887     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2888     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2889   %}
2890   ins_pipe( pipe_slow );
2891 %}
2892 
2893 instruct Repl32B(vecY dst, rRegI src) %{
2894   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2895   match(Set dst (ReplicateB src));
2896   format %{ "movd    $dst,$src\n\t"
2897             "punpcklbw $dst,$dst\n\t"
2898             "pshuflw $dst,$dst,0x00\n\t"
2899             "punpcklqdq $dst,$dst\n\t"
2900             "vinserti128_high $dst,$dst\t! replicate32B" %}
2901   ins_encode %{
2902     __ movdl($dst$$XMMRegister, $src$$Register);
2903     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2904     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2905     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2906     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2907   %}
2908   ins_pipe( pipe_slow );
2909 %}
2910 
2911 instruct Repl32B_mem(vecY dst, memory mem) %{
2912   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2913   match(Set dst (ReplicateB (LoadB mem)));
2914   format %{ "punpcklbw $dst,$mem\n\t"
2915             "pshuflw $dst,$dst,0x00\n\t"
2916             "punpcklqdq $dst,$dst\n\t"
2917             "vinserti128_high $dst,$dst\t! replicate32B" %}
2918   ins_encode %{
2919     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2920     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2921     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2922     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2923   %}
2924   ins_pipe( pipe_slow );
2925 %}
2926 
2927 instruct Repl16B_imm(vecX dst, immI con) %{
2928   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2929   match(Set dst (ReplicateB con));
2930   format %{ "movq    $dst,[$constantaddress]\n\t"
2931             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
2932   ins_encode %{
2933     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2934     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2935   %}
2936   ins_pipe( pipe_slow );
2937 %}
2938 
2939 instruct Repl32B_imm(vecY dst, immI con) %{
2940   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2941   match(Set dst (ReplicateB con));
2942   format %{ "movq    $dst,[$constantaddress]\n\t"
2943             "punpcklqdq $dst,$dst\n\t"
2944             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
2945   ins_encode %{
2946     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2947     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2948     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2949   %}
2950   ins_pipe( pipe_slow );
2951 %}
2952 
2953 instruct Repl4S(vecD dst, rRegI src) %{
2954   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
2955   match(Set dst (ReplicateS src));
2956   format %{ "movd    $dst,$src\n\t"
2957             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
2958   ins_encode %{
2959     __ movdl($dst$$XMMRegister, $src$$Register);
2960     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2961   %}
2962   ins_pipe( pipe_slow );
2963 %}
2964 
2965 instruct Repl4S_mem(vecD dst, memory mem) %{
2966   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2967   match(Set dst (ReplicateS (LoadS mem)));
2968   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
2969   ins_encode %{
2970     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
2971   %}
2972   ins_pipe( pipe_slow );
2973 %}
2974 
2975 instruct Repl8S(vecX dst, rRegI src) %{
2976   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
2977   match(Set dst (ReplicateS src));
2978   format %{ "movd    $dst,$src\n\t"
2979             "pshuflw $dst,$dst,0x00\n\t"
2980             "punpcklqdq $dst,$dst\t! replicate8S" %}
2981   ins_encode %{
2982     __ movdl($dst$$XMMRegister, $src$$Register);
2983     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2984     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2985   %}
2986   ins_pipe( pipe_slow );
2987 %}
2988 
2989 instruct Repl8S_mem(vecX dst, memory mem) %{
2990   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2991   match(Set dst (ReplicateS (LoadS mem)));
2992   format %{ "pshuflw $dst,$mem,0x00\n\t"
2993             "punpcklqdq $dst,$dst\t! replicate8S" %}
2994   ins_encode %{
2995     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
2996     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2997   %}
2998   ins_pipe( pipe_slow );
2999 %}
3000 
3001 instruct Repl8S_imm(vecX dst, immI con) %{
3002   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3003   match(Set dst (ReplicateS con));
3004   format %{ "movq    $dst,[$constantaddress]\n\t"
3005             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3006   ins_encode %{
3007     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3008     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3009   %}
3010   ins_pipe( pipe_slow );
3011 %}
3012 
3013 instruct Repl16S(vecY dst, rRegI src) %{
3014   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3015   match(Set dst (ReplicateS src));
3016   format %{ "movd    $dst,$src\n\t"
3017             "pshuflw $dst,$dst,0x00\n\t"
3018             "punpcklqdq $dst,$dst\n\t"
3019             "vinserti128_high $dst,$dst\t! replicate16S" %}
3020   ins_encode %{
3021     __ movdl($dst$$XMMRegister, $src$$Register);
3022     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3023     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3024     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3025   %}
3026   ins_pipe( pipe_slow );
3027 %}
3028 
3029 instruct Repl16S_mem(vecY dst, memory mem) %{
3030   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3031   match(Set dst (ReplicateS (LoadS mem)));
3032   format %{ "pshuflw $dst,$mem,0x00\n\t"
3033             "punpcklqdq $dst,$dst\n\t"
3034             "vinserti128_high $dst,$dst\t! replicate16S" %}
3035   ins_encode %{
3036     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3037     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3038     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3039   %}
3040   ins_pipe( pipe_slow );
3041 %}
3042 
3043 instruct Repl16S_imm(vecY dst, immI con) %{
3044   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3045   match(Set dst (ReplicateS con));
3046   format %{ "movq    $dst,[$constantaddress]\n\t"
3047             "punpcklqdq $dst,$dst\n\t"
3048             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3049   ins_encode %{
3050     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3051     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3052     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3053   %}
3054   ins_pipe( pipe_slow );
3055 %}
3056 
3057 instruct Repl4I(vecX dst, rRegI src) %{
3058   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3059   match(Set dst (ReplicateI src));
3060   format %{ "movd    $dst,$src\n\t"
3061             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3062   ins_encode %{
3063     __ movdl($dst$$XMMRegister, $src$$Register);
3064     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3065   %}
3066   ins_pipe( pipe_slow );
3067 %}
3068 
3069 instruct Repl4I_mem(vecX dst, memory mem) %{
3070   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3071   match(Set dst (ReplicateI (LoadI mem)));
3072   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3073   ins_encode %{
3074     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3075   %}
3076   ins_pipe( pipe_slow );
3077 %}
3078 
3079 instruct Repl8I(vecY dst, rRegI src) %{
3080   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3081   match(Set dst (ReplicateI src));
3082   format %{ "movd    $dst,$src\n\t"
3083             "pshufd  $dst,$dst,0x00\n\t"
3084             "vinserti128_high $dst,$dst\t! replicate8I" %}
3085   ins_encode %{
3086     __ movdl($dst$$XMMRegister, $src$$Register);
3087     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3088     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3089   %}
3090   ins_pipe( pipe_slow );
3091 %}
3092 
3093 instruct Repl8I_mem(vecY dst, memory mem) %{
3094   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3095   match(Set dst (ReplicateI (LoadI mem)));
3096   format %{ "pshufd  $dst,$mem,0x00\n\t"
3097             "vinserti128_high $dst,$dst\t! replicate8I" %}
3098   ins_encode %{
3099     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3100     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3101   %}
3102   ins_pipe( pipe_slow );
3103 %}
3104 
3105 instruct Repl4I_imm(vecX dst, immI con) %{
3106   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3107   match(Set dst (ReplicateI con));
3108   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3109             "punpcklqdq $dst,$dst" %}
3110   ins_encode %{
3111     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3112     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3113   %}
3114   ins_pipe( pipe_slow );
3115 %}
3116 
3117 instruct Repl8I_imm(vecY dst, immI con) %{
3118   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3119   match(Set dst (ReplicateI con));
3120   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3121             "punpcklqdq $dst,$dst\n\t"
3122             "vinserti128_high $dst,$dst" %}
3123   ins_encode %{
3124     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3125     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3126     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3127   %}
3128   ins_pipe( pipe_slow );
3129 %}
3130 
3131 // A long can be loaded into an XMM register directly from memory.
3132 instruct Repl2L_mem(vecX dst, memory mem) %{
3133   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3134   match(Set dst (ReplicateL (LoadL mem)));
3135   format %{ "movq    $dst,$mem\n\t"
3136             "punpcklqdq $dst,$dst\t! replicate2L" %}
3137   ins_encode %{
3138     __ movq($dst$$XMMRegister, $mem$$Address);
3139     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3140   %}
3141   ins_pipe( pipe_slow );
3142 %}
3143 
3144 // Replicate long (8 byte) scalar to be vector
3145 #ifdef _LP64
3146 instruct Repl4L(vecY dst, rRegL src) %{
3147   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3148   match(Set dst (ReplicateL src));
3149   format %{ "movdq   $dst,$src\n\t"
3150             "punpcklqdq $dst,$dst\n\t"
3151             "vinserti128_high $dst,$dst\t! replicate4L" %}
3152   ins_encode %{
3153     __ movdq($dst$$XMMRegister, $src$$Register);
3154     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3155     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3156   %}
3157   ins_pipe( pipe_slow );
3158 %}
3159 #else // _LP64
3160 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3161   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3162   match(Set dst (ReplicateL src));
3163   effect(TEMP dst, USE src, TEMP tmp);
3164   format %{ "movdl   $dst,$src.lo\n\t"
3165             "movdl   $tmp,$src.hi\n\t"
3166             "punpckldq $dst,$tmp\n\t"
3167             "punpcklqdq $dst,$dst\n\t"
3168             "vinserti128_high $dst,$dst\t! replicate4L" %}
3169   ins_encode %{
3170     __ movdl($dst$$XMMRegister, $src$$Register);
3171     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3172     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3173     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3174     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3175   %}
3176   ins_pipe( pipe_slow );
3177 %}
3178 #endif // _LP64
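
     // On 32-bit (!_LP64) targets the long lives in a register pair, so the rule
     // above first assembles the 64-bit lane with two movdl transfers (low and high
     // halves) and a punpckldq, then broadcasts it with punpcklqdq/vinserti128.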
3179 
3180 instruct Repl4L_imm(vecY dst, immL con) %{
3181   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3182   match(Set dst (ReplicateL con));
3183   format %{ "movq    $dst,[$constantaddress]\n\t"
3184             "punpcklqdq $dst,$dst\n\t"
3185             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3186   ins_encode %{
3187     __ movq($dst$$XMMRegister, $constantaddress($con));
3188     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3189     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3190   %}
3191   ins_pipe( pipe_slow );
3192 %}
3193 
3194 instruct Repl4L_mem(vecY dst, memory mem) %{
3195   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3196   match(Set dst (ReplicateL (LoadL mem)));
3197   format %{ "movq    $dst,$mem\n\t"
3198             "punpcklqdq $dst,$dst\n\t"
3199             "vinserti128_high $dst,$dst\t! replicate4L" %}
3200   ins_encode %{
3201     __ movq($dst$$XMMRegister, $mem$$Address);
3202     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3203     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3204   %}
3205   ins_pipe( pipe_slow );
3206 %}
3207 
3208 instruct Repl2F_mem(vecD dst, memory mem) %{
3209   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3210   match(Set dst (ReplicateF (LoadF mem)));
3211   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3212   ins_encode %{
3213     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3214   %}
3215   ins_pipe( pipe_slow );
3216 %}
3217 
3218 instruct Repl4F_mem(vecX dst, memory mem) %{
3219   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3220   match(Set dst (ReplicateF (LoadF mem)));
3221   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3222   ins_encode %{
3223     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3224   %}
3225   ins_pipe( pipe_slow );
3226 %}
3227 
3228 instruct Repl8F(vecY dst, regF src) %{
3229   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3230   match(Set dst (ReplicateF src));
3231   format %{ "pshufd  $dst,$src,0x00\n\t"
3232             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3233   ins_encode %{
3234     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3235     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3236   %}
3237   ins_pipe( pipe_slow );
3238 %}
3239 
3240 instruct Repl8F_mem(vecY dst, memory mem) %{
3241   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3242   match(Set dst (ReplicateF (LoadF mem)));
3243   format %{ "pshufd  $dst,$mem,0x00\n\t"
3244             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3245   ins_encode %{
3246     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3247     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3248   %}
3249   ins_pipe( pipe_slow );
3250 %}
3251 
3252 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3253   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3254   match(Set dst (ReplicateF zero));
3255   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3256   ins_encode %{
3257     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3258   %}
3259   ins_pipe( fpu_reg_reg );
3260 %}
3261 
3262 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3263   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3264   match(Set dst (ReplicateF zero));
3265   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3266   ins_encode %{
3267     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3268   %}
3269   ins_pipe( fpu_reg_reg );
3270 %}
3271 
3272 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3273   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3274   match(Set dst (ReplicateF zero));
3275   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3276   ins_encode %{
3277     int vector_len = 1;
3278     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3279   %}
3280   ins_pipe( fpu_reg_reg );
3281 %}
3282 
3283 instruct Repl2D_mem(vecX dst, memory mem) %{
3284   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3285   match(Set dst (ReplicateD (LoadD mem)));
3286   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3287   ins_encode %{
3288     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3289   %}
3290   ins_pipe( pipe_slow );
3291 %}
3292 
3293 instruct Repl4D(vecY dst, regD src) %{
3294   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3295   match(Set dst (ReplicateD src));
3296   format %{ "pshufd  $dst,$src,0x44\n\t"
3297             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3298   ins_encode %{
3299     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3300     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3301   %}
3302   ins_pipe( pipe_slow );
3303 %}
3304 
3305 instruct Repl4D_mem(vecY dst, memory mem) %{
3306   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3307   match(Set dst (ReplicateD (LoadD mem)));
3308   format %{ "pshufd  $dst,$mem,0x44\n\t"
3309             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3310   ins_encode %{
3311     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3312     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3313   %}
3314   ins_pipe( pipe_slow );
3315 %}
3316 
3317 // Replicate double (8 byte) scalar zero to be vector
3318 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3319   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3320   match(Set dst (ReplicateD zero));
3321   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3322   ins_encode %{
3323     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3324   %}
3325   ins_pipe( fpu_reg_reg );
3326 %}
3327 
3328 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3329   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3330   match(Set dst (ReplicateD zero));
3331   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3332   ins_encode %{
3333     int vector_len = 1;
3334     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3335   %}
3336   ins_pipe( fpu_reg_reg );
3337 %}
3338 
3339 // ====================GENERIC REPLICATE==========================================
3340 
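     // The rules in this block are predicated only on vector length, so they act
     // as the baseline (SSE-level) replicate forms available on every target.
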
3341 // Replicate byte scalar to be vector
3342 instruct Repl4B(vecS dst, rRegI src) %{
3343   predicate(n->as_Vector()->length() == 4);
3344   match(Set dst (ReplicateB src));
3345   format %{ "movd    $dst,$src\n\t"
3346             "punpcklbw $dst,$dst\n\t"
3347             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3348   ins_encode %{
3349     __ movdl($dst$$XMMRegister, $src$$Register);
3350     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3351     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct Repl8B(vecD dst, rRegI src) %{
3357   predicate(n->as_Vector()->length() == 8);
3358   match(Set dst (ReplicateB src));
3359   format %{ "movd    $dst,$src\n\t"
3360             "punpcklbw $dst,$dst\n\t"
3361             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3362   ins_encode %{
3363     __ movdl($dst$$XMMRegister, $src$$Register);
3364     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3365     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3366   %}
3367   ins_pipe( pipe_slow );
3368 %}
3369 
3370 // Replicate byte scalar immediate to be vector by loading from const table.
3371 instruct Repl4B_imm(vecS dst, immI con) %{
3372   predicate(n->as_Vector()->length() == 4);
3373   match(Set dst (ReplicateB con));
3374   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3375   ins_encode %{
3376     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3377   %}
3378   ins_pipe( pipe_slow );
3379 %}
3380 
3381 instruct Repl8B_imm(vecD dst, immI con) %{
3382   predicate(n->as_Vector()->length() == 8);
3383   match(Set dst (ReplicateB con));
3384   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3385   ins_encode %{
3386     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3387   %}
3388   ins_pipe( pipe_slow );
3389 %}
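
     // The replicate4_imm/replicate8_imm helpers used above are assumed here to
     // build a 32-/64-bit constant by repeating the immediate at the given element
     // width (e.g. replicate8_imm(0x41, 1) would give 0x4141414141414141), so a
     // single movdl/movq from the constant table fills every lane at once.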
3390 
3391 // Replicate byte scalar zero to be vector
3392 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3393   predicate(n->as_Vector()->length() == 4);
3394   match(Set dst (ReplicateB zero));
3395   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3396   ins_encode %{
3397     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3398   %}
3399   ins_pipe( fpu_reg_reg );
3400 %}
3401 
3402 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3403   predicate(n->as_Vector()->length() == 8);
3404   match(Set dst (ReplicateB zero));
3405   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3406   ins_encode %{
3407     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3408   %}
3409   ins_pipe( fpu_reg_reg );
3410 %}
3411 
3412 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3413   predicate(n->as_Vector()->length() == 16);
3414   match(Set dst (ReplicateB zero));
3415   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3416   ins_encode %{
3417     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3418   %}
3419   ins_pipe( fpu_reg_reg );
3420 %}
3421 
3422 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3423   predicate(n->as_Vector()->length() == 32);
3424   match(Set dst (ReplicateB zero));
3425   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3426   ins_encode %{
3427     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3428     int vector_len = 1;
3429     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3430   %}
3431   ins_pipe( fpu_reg_reg );
3432 %}
3433 
3434 // Replicate char/short (2 byte) scalar to be vector
3435 instruct Repl2S(vecS dst, rRegI src) %{
3436   predicate(n->as_Vector()->length() == 2);
3437   match(Set dst (ReplicateS src));
3438   format %{ "movd    $dst,$src\n\t"
3439             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3440   ins_encode %{
3441     __ movdl($dst$$XMMRegister, $src$$Register);
3442     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3443   %}
3444   ins_pipe( fpu_reg_reg );
3445 %}
3446 
3447 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3448 instruct Repl2S_imm(vecS dst, immI con) %{
3449   predicate(n->as_Vector()->length() == 2);
3450   match(Set dst (ReplicateS con));
3451   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3452   ins_encode %{
3453     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3454   %}
3455   ins_pipe( fpu_reg_reg );
3456 %}
3457 
3458 instruct Repl4S_imm(vecD dst, immI con) %{
3459   predicate(n->as_Vector()->length() == 4);
3460   match(Set dst (ReplicateS con));
3461   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3462   ins_encode %{
3463     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3464   %}
3465   ins_pipe( fpu_reg_reg );
3466 %}
3467 
3468 // Replicate char/short (2 byte) scalar zero to be vector
3469 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3470   predicate(n->as_Vector()->length() == 2);
3471   match(Set dst (ReplicateS zero));
3472   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3473   ins_encode %{
3474     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3475   %}
3476   ins_pipe( fpu_reg_reg );
3477 %}
3478 
3479 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3480   predicate(n->as_Vector()->length() == 4);
3481   match(Set dst (ReplicateS zero));
3482   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3483   ins_encode %{
3484     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3485   %}
3486   ins_pipe( fpu_reg_reg );
3487 %}
3488 
3489 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3490   predicate(n->as_Vector()->length() == 8);
3491   match(Set dst (ReplicateS zero));
3492   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3493   ins_encode %{
3494     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3495   %}
3496   ins_pipe( fpu_reg_reg );
3497 %}
3498 
3499 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3500   predicate(n->as_Vector()->length() == 16);
3501   match(Set dst (ReplicateS zero));
3502   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3503   ins_encode %{
3504     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3505     int vector_len = 1;
3506     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3507   %}
3508   ins_pipe( fpu_reg_reg );
3509 %}
3510 
3511 // Replicate integer (4 byte) scalar to be vector
3512 instruct Repl2I(vecD dst, rRegI src) %{
3513   predicate(n->as_Vector()->length() == 2);
3514   match(Set dst (ReplicateI src));
3515   format %{ "movd    $dst,$src\n\t"
3516             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3517   ins_encode %{
3518     __ movdl($dst$$XMMRegister, $src$$Register);
3519     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3520   %}
3521   ins_pipe( fpu_reg_reg );
3522 %}
3523 
3524 // An integer can be loaded into an XMM register directly from memory.
3525 instruct Repl2I_mem(vecD dst, memory mem) %{
3526   predicate(n->as_Vector()->length() == 2);
3527   match(Set dst (ReplicateI (LoadI mem)));
3528   format %{ "movd    $dst,$mem\n\t"
3529             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3530   ins_encode %{
3531     __ movdl($dst$$XMMRegister, $mem$$Address);
3532     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3533   %}
3534   ins_pipe( fpu_reg_reg );
3535 %}
3536 
3537 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3538 instruct Repl2I_imm(vecD dst, immI con) %{
3539   predicate(n->as_Vector()->length() == 2);
3540   match(Set dst (ReplicateI con));
3541   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3542   ins_encode %{
3543     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3544   %}
3545   ins_pipe( fpu_reg_reg );
3546 %}
3547 
3548 // Replicate integer (4 byte) scalar zero to be vector
3549 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3550   predicate(n->as_Vector()->length() == 2);
3551   match(Set dst (ReplicateI zero));
3552   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3553   ins_encode %{
3554     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3555   %}
3556   ins_pipe( fpu_reg_reg );
3557 %}
3558 
3559 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3560   predicate(n->as_Vector()->length() == 4);
3561   match(Set dst (ReplicateI zero));
3562   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
3563   ins_encode %{
3564     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3565   %}
3566   ins_pipe( fpu_reg_reg );
3567 %}
3568 
3569 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3570   predicate(n->as_Vector()->length() == 8);
3571   match(Set dst (ReplicateI zero));
3572   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3573   ins_encode %{
3574     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3575     int vector_len = 1;
3576     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3577   %}
3578   ins_pipe( fpu_reg_reg );
3579 %}
3580 
3581 // Replicate long (8 byte) scalar to be vector
3582 #ifdef _LP64
3583 instruct Repl2L(vecX dst, rRegL src) %{
3584   predicate(n->as_Vector()->length() == 2);
3585   match(Set dst (ReplicateL src));
3586   format %{ "movdq   $dst,$src\n\t"
3587             "punpcklqdq $dst,$dst\t! replicate2L" %}
3588   ins_encode %{
3589     __ movdq($dst$$XMMRegister, $src$$Register);
3590     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3591   %}
3592   ins_pipe( pipe_slow );
3593 %}
3594 #else // _LP64
3595 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3596   predicate(n->as_Vector()->length() == 2);
3597   match(Set dst (ReplicateL src));
3598   effect(TEMP dst, USE src, TEMP tmp);
3599   format %{ "movdl   $dst,$src.lo\n\t"
3600             "movdl   $tmp,$src.hi\n\t"
3601             "punpckldq $dst,$tmp\n\t"
3602             "punpcklqdq $dst,$dst\t! replicate2L"%}
3603   ins_encode %{
3604     __ movdl($dst$$XMMRegister, $src$$Register);
3605     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3606     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3607     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3608   %}
3609   ins_pipe( pipe_slow );
3610 %}
3611 #endif // _LP64
3612 
3613 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3614 instruct Repl2L_imm(vecX dst, immL con) %{
3615   predicate(n->as_Vector()->length() == 2);
3616   match(Set dst (ReplicateL con));
3617   format %{ "movq    $dst,[$constantaddress]\n\t"
3618             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3619   ins_encode %{
3620     __ movq($dst$$XMMRegister, $constantaddress($con));
3621     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3622   %}
3623   ins_pipe( pipe_slow );
3624 %}
3625 
3626 // Replicate long (8 byte) scalar zero to be vector
3627 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3628   predicate(n->as_Vector()->length() == 2);
3629   match(Set dst (ReplicateL zero));
3630   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3631   ins_encode %{
3632     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3633   %}
3634   ins_pipe( fpu_reg_reg );
3635 %}
3636 
3637 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3638   predicate(n->as_Vector()->length() == 4);
3639   match(Set dst (ReplicateL zero));
3640   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3641   ins_encode %{
3642     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3643     int vector_len = 1;
3644     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3645   %}
3646   ins_pipe( fpu_reg_reg );
3647 %}
3648 
3649 // Replicate float (4 byte) scalar to be vector
3650 instruct Repl2F(vecD dst, regF src) %{
3651   predicate(n->as_Vector()->length() == 2);
3652   match(Set dst (ReplicateF src));
3653   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
3654   ins_encode %{
3655     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3656   %}
3657   ins_pipe( fpu_reg_reg );
3658 %}
3659 
3660 instruct Repl4F(vecX dst, regF src) %{
3661   predicate(n->as_Vector()->length() == 4);
3662   match(Set dst (ReplicateF src));
3663   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
3664   ins_encode %{
3665     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3666   %}
3667   ins_pipe( pipe_slow );
3668 %}
3669 
3670 // Replicate double (8 bytes) scalar to be vector
3671 instruct Repl2D(vecX dst, regD src) %{
3672   predicate(n->as_Vector()->length() == 2);
3673   match(Set dst (ReplicateD src));
3674   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3675   ins_encode %{
3676     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3677   %}
3678   ins_pipe( pipe_slow );
3679 %}
3680 
3681 // ====================EVEX REPLICATE=============================================
3682 
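     // These rules use the EVEX vpbroadcastb/w/d register and memory forms. The
     // predicates require avx512vl (plus avx512bw for byte/short elements) so the
     // broadcast encodings are also legal at 128- and 256-bit vector lengths.
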
3683 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
3684   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3685   match(Set dst (ReplicateB (LoadB mem)));
3686   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
3687   ins_encode %{
3688     int vector_len = 0;
3689     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3690   %}
3691   ins_pipe( pipe_slow );
3692 %}
3693 
3694 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
3695   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3696   match(Set dst (ReplicateB (LoadB mem)));
3697   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
3698   ins_encode %{
3699     int vector_len = 0;
3700     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3701   %}
3702   ins_pipe( pipe_slow );
3703 %}
3704 
3705 instruct Repl16B_evex(vecX dst, rRegI src) %{
3706   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3707   match(Set dst (ReplicateB src));
3708   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
3709   ins_encode %{
3710     int vector_len = 0;
3711     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3712   %}
3713   ins_pipe( pipe_slow );
3714 %}
3715 
3716 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
3717   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3718   match(Set dst (ReplicateB (LoadB mem)));
3719   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
3720   ins_encode %{
3721     int vector_len = 0;
3722     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3723   %}
3724   ins_pipe( pipe_slow );
3725 %}
3726 
3727 instruct Repl32B_evex(vecY dst, rRegI src) %{
3728   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3729   match(Set dst (ReplicateB src));
3730   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
3731   ins_encode %{
3732     int vector_len = 1;
3733     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3734   %}
3735   ins_pipe( pipe_slow );
3736 %}
3737 
3738 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
3739   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3740   match(Set dst (ReplicateB (LoadB mem)));
3741   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
3742   ins_encode %{
3743     int vector_len = 1;
3744     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3745   %}
3746   ins_pipe( pipe_slow );
3747 %}
3748 
3749 instruct Repl64B_evex(vecZ dst, rRegI src) %{
3750   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3751   match(Set dst (ReplicateB src));
3752   format %{ "vpbroadcastb $dst,$src\t! replicate64B" %}
3753   ins_encode %{
3754     int vector_len = 2;
3755     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3756   %}
3757   ins_pipe( pipe_slow );
3758 %}
3759 
3760 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
3761   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3762   match(Set dst (ReplicateB (LoadB mem)));
3763   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
3764   ins_encode %{
3765     int vector_len = 2;
3766     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3767   %}
3768   ins_pipe( pipe_slow );
3769 %}
3770 
3771 instruct Repl16B_imm_evex(vecX dst, immI con) %{
3772   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3773   match(Set dst (ReplicateB con));
3774   format %{ "movq    $dst,[$constantaddress]\n\t"
3775             "vpbroadcastb $dst,$dst\t! replicate16B" %}
3776   ins_encode %{
3777     int vector_len = 0;
3778     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3779     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3780   %}
3781   ins_pipe( pipe_slow );
3782 %}
3783 
3784 instruct Repl32B_imm_evex(vecY dst, immI con) %{
3785   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3786   match(Set dst (ReplicateB con));
3787   format %{ "movq    $dst,[$constantaddress]\n\t"
3788             "vpbroadcastb $dst,$dst\t! replicate32B" %}
3789   ins_encode %{
3790     int vector_len = 1;
3791     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3792     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3793   %}
3794   ins_pipe( pipe_slow );
3795 %}
3796 
3797 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
3798   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3799   match(Set dst (ReplicateB con));
3800   format %{ "movq    $dst,[$constantaddress]\n\t"
3801             "vpbroadcastb $dst,$dst\t! replicate64B" %}
3802   ins_encode %{
3803     int vector_len = 2;
3804     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3805     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3806   %}
3807   ins_pipe( pipe_slow );
3808 %}
3809 
3810 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
3811   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3812   match(Set dst (ReplicateB zero));
3813   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3814   ins_encode %{
3815     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
3816     int vector_len = 2;
3817     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3818   %}
3819   ins_pipe( fpu_reg_reg );
3820 %}
3821 
3822 instruct Repl4S_evex(vecD dst, rRegI src) %{
3823   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3824   match(Set dst (ReplicateS src));
3825   format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
3826   ins_encode %{
3827     int vector_len = 0;
3828     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3829   %}
3830   ins_pipe( pipe_slow );
3831 %}
3832 
3833 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
3834   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3835   match(Set dst (ReplicateS (LoadS mem)));
3836   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
3837   ins_encode %{
3838     int vector_len = 0;
3839     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3840   %}
3841   ins_pipe( pipe_slow );
3842 %}
3843 
3844 instruct Repl8S_evex(vecX dst, rRegI src) %{
3845   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3846   match(Set dst (ReplicateS src));
3847   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
3848   ins_encode %{
3849     int vector_len = 0;
3850     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3851   %}
3852   ins_pipe( pipe_slow );
3853 %}
3854 
3855 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
3856   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3857   match(Set dst (ReplicateS (LoadS mem)));
3858   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
3859   ins_encode %{
3860     int vector_len = 0;
3861     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3862   %}
3863   ins_pipe( pipe_slow );
3864 %}
3865 
3866 instruct Repl16S_evex(vecY dst, rRegI src) %{
3867   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3868   match(Set dst (ReplicateS src));
3869   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
3870   ins_encode %{
3871     int vector_len = 1;
3872     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3873   %}
3874   ins_pipe( pipe_slow );
3875 %}
3876 
3877 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
3878   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3879   match(Set dst (ReplicateS (LoadS mem)));
3880   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
3881   ins_encode %{
3882     int vector_len = 1;
3883     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3884   %}
3885   ins_pipe( pipe_slow );
3886 %}
3887 
3888 instruct Repl32S_evex(vecZ dst, rRegI src) %{
3889   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3890   match(Set dst (ReplicateS src));
3891   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
3892   ins_encode %{
3893     int vector_len = 2;
3894     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3895   %}
3896   ins_pipe( pipe_slow );
3897 %}
3898 
3899 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
3900   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3901   match(Set dst (ReplicateS (LoadS mem)));
3902   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
3903   ins_encode %{
3904     int vector_len = 2;
3905     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3906   %}
3907   ins_pipe( pipe_slow );
3908 %}
3909 
3910 instruct Repl8S_imm_evex(vecX dst, immI con) %{
3911   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3912   match(Set dst (ReplicateS con));
3913   format %{ "movq    $dst,[$constantaddress]\n\t"
3914             "vpbroadcastw $dst,$dst\t! replicate8S" %}
3915   ins_encode %{
3916     int vector_len = 0;
3917     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3918     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3919   %}
3920   ins_pipe( pipe_slow );
3921 %}
3922 
3923 instruct Repl16S_imm_evex(vecY dst, immI con) %{
3924   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3925   match(Set dst (ReplicateS con));
3926   format %{ "movq    $dst,[$constantaddress]\n\t"
3927             "vpbroadcastw $dst,$dst\t! replicate16S" %}
3928   ins_encode %{
3929     int vector_len = 1;
3930     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3931     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3932   %}
3933   ins_pipe( pipe_slow );
3934 %}
3935 
3936 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
3937   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3938   match(Set dst (ReplicateS con));
3939   format %{ "movq    $dst,[$constantaddress]\n\t"
3940             "vpbroadcastw $dst,$dst\t! replicate32S" %}
3941   ins_encode %{
3942     int vector_len = 2;
3943     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3944     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3945   %}
3946   ins_pipe( pipe_slow );
3947 %}
3948 
3949 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
3950   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
3951   match(Set dst (ReplicateS zero));
3952   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
3953   ins_encode %{
3954     // Use vpxor: plain AVX has no 512-bit vpxor; the EVEX (AVX-512) encoding provides it.
3955     int vector_len = 2;
3956     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3957   %}
3958   ins_pipe( fpu_reg_reg );
3959 %}
3960 
3961 instruct Repl4I_evex(vecX dst, rRegI src) %{
3962   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3963   match(Set dst (ReplicateI src));
3964   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
3965   ins_encode %{
3966     int vector_len = 0;
3967     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3968   %}
3969   ins_pipe( pipe_slow );
3970 %}
3971 
3972 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
3973   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3974   match(Set dst (ReplicateI (LoadI mem)));
3975   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
3976   ins_encode %{
3977     int vector_len = 0;
3978     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
3979   %}
3980   ins_pipe( pipe_slow );
3981 %}
3982 
3983 instruct Repl8I_evex(vecY dst, rRegI src) %{
3984   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
3985   match(Set dst (ReplicateI src));
3986   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
3987   ins_encode %{
3988     int vector_len = 1;
3989     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3990   %}
3991   ins_pipe( pipe_slow );
3992 %}
3993 
3994 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
3995   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
3996   match(Set dst (ReplicateI (LoadI mem)));
3997   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
3998   ins_encode %{
3999     int vector_len = 1;
4000     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4001   %}
4002   ins_pipe( pipe_slow );
4003 %}
4004 
4005 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4006   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4007   match(Set dst (ReplicateI src));
4008   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
4009   ins_encode %{
4010     int vector_len = 2;
4011     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4012   %}
4013   ins_pipe( pipe_slow );
4014 %}
4015 
4016 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4017   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4018   match(Set dst (ReplicateI (LoadI mem)));
4019   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4020   ins_encode %{
4021     int vector_len = 2;
4022     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4023   %}
4024   ins_pipe( pipe_slow );
4025 %}
4026 
4027 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4028   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4029   match(Set dst (ReplicateI con));
4030   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4031             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4032   ins_encode %{
4033     int vector_len = 0;
4034     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4035     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4036   %}
4037   ins_pipe( pipe_slow );
4038 %}
4039 
4040 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4041   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4042   match(Set dst (ReplicateI con));
4043   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4044             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4045   ins_encode %{
4046     int vector_len = 1;
4047     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4048     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4049   %}
4050   ins_pipe( pipe_slow );
4051 %}
4052 
4053 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4054   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4055   match(Set dst (ReplicateI con));
4056   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4057             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4058   ins_encode %{
4059     int vector_len = 2;
4060     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4061     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4062   %}
4063   ins_pipe( pipe_slow );
4064 %}
4065 
4066 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4067   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4068   match(Set dst (ReplicateI zero));
4069   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4070   ins_encode %{
4071     // Use vpxor: plain AVX has no 512-bit vpxor; the EVEX (AVX-512) encoding provides it.
4072     int vector_len = 2;
4073     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4074   %}
4075   ins_pipe( fpu_reg_reg );
4076 %}
4077 
4078 // Replicate long (8 byte) scalar to be vector
4079 #ifdef _LP64
4080 instruct Repl4L_evex(vecY dst, rRegL src) %{
4081   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4082   match(Set dst (ReplicateL src));
4083   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4084   ins_encode %{
4085     int vector_len = 1;
4086     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4087   %}
4088   ins_pipe( pipe_slow );
4089 %}
4090 
4091 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4092   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4093   match(Set dst (ReplicateL src));
4094   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4095   ins_encode %{
4096     int vector_len = 2;
4097     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4098   %}
4099   ins_pipe( pipe_slow );
4100 %}
4101 #else // _LP64
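// On 32-bit, a long occupies a register pair: move the low and high halves
// into XMM registers with movdl, merge them with punpckldq, then broadcast
// the resulting 64-bit lane.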
4102 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4103   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4104   match(Set dst (ReplicateL src));
4105   effect(TEMP dst, USE src, TEMP tmp);
4106   format %{ "movdl   $dst,$src.lo\n\t"
4107             "movdl   $tmp,$src.hi\n\t"
4108             "punpckldq $dst,$tmp\n\t"
4109             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4110   ins_encode %{
4111     int vector_len = 1;
4112     __ movdl($dst$$XMMRegister, $src$$Register);
4113     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4114     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4115     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4116   %}
4117   ins_pipe( pipe_slow );
4118 %}
4119 
4120 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4121   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4122   match(Set dst (ReplicateL src));
4123   effect(TEMP dst, USE src, TEMP tmp);
4124   format %{ "movdl   $dst,$src.lo\n\t"
4125             "movdl   $tmp,$src.hi\n\t"
4126             "punpckldq $dst,$tmp\n\t"
4127             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4128   ins_encode %{
4129     int vector_len = 2;
4130     __ movdl($dst$$XMMRegister, $src$$Register);
4131     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4132     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4133     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4134   %}
4135   ins_pipe( pipe_slow );
4136 %}
4137 #endif // _LP64
4138 
4139 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4140   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4141   match(Set dst (ReplicateL con));
4142   format %{ "movq    $dst,[$constantaddress]\n\t"
4143             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4144   ins_encode %{
4145     int vector_len = 1;
4146     __ movq($dst$$XMMRegister, $constantaddress($con));
4147     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4148   %}
4149   ins_pipe( pipe_slow );
4150 %}
4151 
4152 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4153   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4154   match(Set dst (ReplicateL con));
4155   format %{ "movq    $dst,[$constantaddress]\n\t"
4156             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4157   ins_encode %{
4158     int vector_len = 2;
4159     __ movq($dst$$XMMRegister, $constantaddress($con));
4160     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4161   %}
4162   ins_pipe( pipe_slow );
4163 %}
4164 
4165 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4166   predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
4167   match(Set dst (ReplicateL (LoadL mem)));
4168   format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
4169   ins_encode %{
4170     int vector_len = 0;
4171     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4172   %}
4173   ins_pipe( pipe_slow );
4174 %}
4175 
4176 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4177   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4178   match(Set dst (ReplicateL (LoadL mem)));
4179   format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
4180   ins_encode %{
4181     int vector_len = 1;
4182     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4183   %}
4184   ins_pipe( pipe_slow );
4185 %}
4186 
4187 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4188   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4189   match(Set dst (ReplicateL (LoadL mem)));
4190   format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
4191   ins_encode %{
4192     int vector_len = 2;
4193     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4194   %}
4195   ins_pipe( pipe_slow );
4196 %}
4197 
4198 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4199   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4200   match(Set dst (ReplicateL zero));
4201   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4202   ins_encode %{
4203     // Use vpxor: plain AVX has no 512-bit vpxor; the EVEX (AVX-512) encoding provides it.
4204     int vector_len = 2;
4205     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4206   %}
4207   ins_pipe( fpu_reg_reg );
4208 %}
4209 
4210 instruct Repl8F_evex(vecY dst, regF src) %{
4211   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4212   match(Set dst (ReplicateF src));
4213   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4214   ins_encode %{
4215     int vector_len = 1;
4216     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4217   %}
4218   ins_pipe( pipe_slow );
4219 %}
4220 
4221 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4222   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4223   match(Set dst (ReplicateF (LoadF mem)));
4224   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4225   ins_encode %{
4226     int vector_len = 1;
4227     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4228   %}
4229   ins_pipe( pipe_slow );
4230 %}
4231 
4232 instruct Repl16F_evex(vecZ dst, regF src) %{
4233   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4234   match(Set dst (ReplicateF src));
4235   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4236   ins_encode %{
4237     int vector_len = 2;
4238     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4239   %}
4240   ins_pipe( pipe_slow );
4241 %}
4242 
4243 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4244   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4245   match(Set dst (ReplicateF (LoadF mem)));
4246   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4247   ins_encode %{
4248     int vector_len = 2;
4249     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4250   %}
4251   ins_pipe( pipe_slow );
4252 %}
4253 
4254 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4255   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4256   match(Set dst (ReplicateF zero));
4257   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4258   ins_encode %{
4259     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation
4260     int vector_len = 2;
4261     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4262   %}
4263   ins_pipe( fpu_reg_reg );
4264 %}
4265 
4266 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4267   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4268   match(Set dst (ReplicateF zero));
4269   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4270   ins_encode %{
4271     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation
4272     int vector_len = 2;
4273     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4274   %}
4275   ins_pipe( fpu_reg_reg );
4276 %}
4277 
4278 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4279   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4280   match(Set dst (ReplicateF zero));
4281   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4282   ins_encode %{
4283     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation
4284     int vector_len = 2;
4285     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4286   %}
4287   ins_pipe( fpu_reg_reg );
4288 %}
4289 
4290 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4291   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4292   match(Set dst (ReplicateF zero));
4293   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4294   ins_encode %{
4295     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation
4296     int vector_len = 2;
4297     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4298   %}
4299   ins_pipe( fpu_reg_reg );
4300 %}
4301 
4302 instruct Repl4D_evex(vecY dst, regD src) %{
4303   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4304   match(Set dst (ReplicateD src));
4305   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4306   ins_encode %{
4307     int vector_len = 1;
4308     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4309   %}
4310   ins_pipe( pipe_slow );
4311 %}
4312 
4313 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4314   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4315   match(Set dst (ReplicateD (LoadD mem)));
4316   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4317   ins_encode %{
4318     int vector_len = 1;
4319     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4320   %}
4321   ins_pipe( pipe_slow );
4322 %}
4323 
4324 instruct Repl8D_evex(vecZ dst, regD src) %{
4325   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4326   match(Set dst (ReplicateD src));
4327   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4328   ins_encode %{
4329     int vector_len = 2;
4330     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4331   %}
4332   ins_pipe( pipe_slow );
4333 %}
4334 
4335 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4336   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4337   match(Set dst (ReplicateD (LoadD mem)));
4338   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4339   ins_encode %{
4340     int vector_len = 2;
4341     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4342   %}
4343   ins_pipe( pipe_slow );
4344 %}
4345 
4346 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4347   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4348   match(Set dst (ReplicateD zero));
4349   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4350   ins_encode %{
4351     // Use vpxor in place of vxorpd: the EVEX-encoded vxorpd requires AVX512DQ, and this is a 512-bit operation
4352     int vector_len = 2;
4353     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4354   %}
4355   ins_pipe( fpu_reg_reg );
4356 %}
4357 
4358 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4359   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4360   match(Set dst (ReplicateD zero));
4361   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4362   ins_encode %{
4363     // Use vpxor in place of vxorpd: the EVEX-encoded vxorpd requires AVX512DQ, and this is a 512-bit operation
4364     int vector_len = 2;
4365     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4366   %}
4367   ins_pipe( fpu_reg_reg );
4368 %}
4369 
4370 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4371   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4372   match(Set dst (ReplicateD zero));
4373   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4374   ins_encode %{
4375     // Use vpxor in place of vxorpd: the EVEX-encoded vxorpd requires AVX512DQ, and this is a 512-bit operation
4376     int vector_len = 2;
4377     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4378   %}
4379   ins_pipe( fpu_reg_reg );
4380 %}
4381 
4382 // ====================REDUCTION ARITHMETIC=======================================
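//
// The instructs below lower the AddReductionV*/MulReductionV* ideal nodes:
// they combine a scalar input (src1, or dst for the FP forms) with every lane
// of a vector input (src2) and leave the result in a scalar register.
// Roughly equivalent C (illustrative sketch only, not generated code):
//
//   int result = src1;
//   for (int i = 0; i < N; i++) result += src2[i];   // '*=' for MulReductionV*
//
// The integer and long forms fold the lanes by repeatedly halving the vector
// (extract the upper half, combine with the lower half, shuffle, combine
// again); the floating point forms accumulate the lanes into dst one by one.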
4383 
4384 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4385   predicate(UseSSE > 2 && UseAVX == 0);
4386   match(Set dst (AddReductionVI src1 src2));
4387   effect(TEMP tmp2, TEMP tmp);
4388   format %{ "movdqu  $tmp2,$src2\n\t"
4389             "phaddd  $tmp2,$tmp2\n\t"
4390             "movd    $tmp,$src1\n\t"
4391             "paddd   $tmp,$tmp2\n\t"
4392             "movd    $dst,$tmp\t! add reduction2I" %}
4393   ins_encode %{
4394     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4395     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4396     __ movdl($tmp$$XMMRegister, $src1$$Register);
4397     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4398     __ movdl($dst$$Register, $tmp$$XMMRegister);
4399   %}
4400   ins_pipe( pipe_slow );
4401 %}
4402 
4403 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4404   predicate(VM_Version::supports_avxonly());
4405   match(Set dst (AddReductionVI src1 src2));
4406   effect(TEMP tmp, TEMP tmp2);
4407   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4408             "movd     $tmp2,$src1\n\t"
4409             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4410             "movd     $dst,$tmp2\t! add reduction2I" %}
4411   ins_encode %{
4412     int vector_len = 0;
4413     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4414     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4415     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4416     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4417   %}
4418   ins_pipe( pipe_slow );
4419 %}
4420 
4421 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4422   predicate(UseAVX > 2);
4423   match(Set dst (AddReductionVI src1 src2));
4424   effect(TEMP tmp, TEMP tmp2);
4425   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4426             "vpaddd  $tmp,$src2,$tmp2\n\t"
4427             "movd    $tmp2,$src1\n\t"
4428             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4429             "movd    $dst,$tmp2\t! add reduction2I" %}
4430   ins_encode %{
4431     int vector_len = 0;
4432     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4433     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4434     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4435     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4436     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4437   %}
4438   ins_pipe( pipe_slow );
4439 %}
4440 
4441 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4442   predicate(UseSSE > 2 && UseAVX == 0);
4443   match(Set dst (AddReductionVI src1 src2));
4444   effect(TEMP tmp, TEMP tmp2);
4445   format %{ "movdqu  $tmp,$src2\n\t"
4446             "phaddd  $tmp,$tmp\n\t"
4447             "phaddd  $tmp,$tmp\n\t"
4448             "movd    $tmp2,$src1\n\t"
4449             "paddd   $tmp2,$tmp\n\t"
4450             "movd    $dst,$tmp2\t! add reduction4I" %}
4451   ins_encode %{
4452     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4453     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4454     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4455     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4456     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4457     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4458   %}
4459   ins_pipe( pipe_slow );
4460 %}
4461 
4462 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4463   predicate(VM_Version::supports_avxonly());
4464   match(Set dst (AddReductionVI src1 src2));
4465   effect(TEMP tmp, TEMP tmp2);
4466   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4467             "vphaddd  $tmp,$tmp,$tmp\n\t"
4468             "movd     $tmp2,$src1\n\t"
4469             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4470             "movd     $dst,$tmp2\t! add reduction4I" %}
4471   ins_encode %{
4472     int vector_len = 0;
4473     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4474     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4475     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4476     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4477     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4478   %}
4479   ins_pipe( pipe_slow );
4480 %}
4481 
4482 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4483   predicate(UseAVX > 2);
4484   match(Set dst (AddReductionVI src1 src2));
4485   effect(TEMP tmp, TEMP tmp2);
4486   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4487             "vpaddd  $tmp,$src2,$tmp2\n\t"
4488             "pshufd  $tmp2,$tmp,0x1\n\t"
4489             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4490             "movd    $tmp2,$src1\n\t"
4491             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4492             "movd    $dst,$tmp2\t! add reduction4I" %}
4493   ins_encode %{
4494     int vector_len = 0;
4495     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4496     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4497     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4498     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4499     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4500     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4501     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4502   %}
4503   ins_pipe( pipe_slow );
4504 %}
4505 
4506 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4507   predicate(VM_Version::supports_avxonly());
4508   match(Set dst (AddReductionVI src1 src2));
4509   effect(TEMP tmp, TEMP tmp2);
4510   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4511             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4512             "vextracti128_high  $tmp2,$tmp\n\t"
4513             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4514             "movd     $tmp2,$src1\n\t"
4515             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4516             "movd     $dst,$tmp2\t! add reduction8I" %}
4517   ins_encode %{
4518     int vector_len = 1;
4519     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4520     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4521     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4522     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4523     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4524     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4525     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4526   %}
4527   ins_pipe( pipe_slow );
4528 %}
4529 
4530 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4531   predicate(UseAVX > 2);
4532   match(Set dst (AddReductionVI src1 src2));
4533   effect(TEMP tmp, TEMP tmp2);
4534   format %{ "vextracti128_high  $tmp,$src2\n\t"
4535             "vpaddd  $tmp,$tmp,$src2\n\t"
4536             "pshufd  $tmp2,$tmp,0xE\n\t"
4537             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4538             "pshufd  $tmp2,$tmp,0x1\n\t"
4539             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4540             "movd    $tmp2,$src1\n\t"
4541             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4542             "movd    $dst,$tmp2\t! add reduction8I" %}
4543   ins_encode %{
4544     int vector_len = 0;
4545     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4546     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4547     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4548     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4549     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4550     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4551     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4552     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4553     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4554   %}
4555   ins_pipe( pipe_slow );
4556 %}
4557 
4558 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4559   predicate(UseAVX > 2);
4560   match(Set dst (AddReductionVI src1 src2));
4561   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4562   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4563             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4564             "vextracti128_high  $tmp,$tmp3\n\t"
4565             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4566             "pshufd  $tmp2,$tmp,0xE\n\t"
4567             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4568             "pshufd  $tmp2,$tmp,0x1\n\t"
4569             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4570             "movd    $tmp2,$src1\n\t"
4571             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4572             "movd    $dst,$tmp2\t! add reduction16I" %}
4573   ins_encode %{
4574     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4575     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4576     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4577     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4578     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4579     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4580     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4581     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4582     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4583     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4584     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4585   %}
4586   ins_pipe( pipe_slow );
4587 %}
4588 
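// The long reductions move the scalar between a general register and an XMM
// register with movdq, which requires a 64-bit GPR, so they are 64-bit only.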
4589 #ifdef _LP64
4590 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4591   predicate(UseAVX > 2);
4592   match(Set dst (AddReductionVL src1 src2));
4593   effect(TEMP tmp, TEMP tmp2);
4594   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4595             "vpaddq  $tmp,$src2,$tmp2\n\t"
4596             "movdq   $tmp2,$src1\n\t"
4597             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4598             "movdq   $dst,$tmp2\t! add reduction2L" %}
4599   ins_encode %{
4600     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4601     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4602     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4603     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4604     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4605   %}
4606   ins_pipe( pipe_slow );
4607 %}
4608 
4609 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4610   predicate(UseAVX > 2);
4611   match(Set dst (AddReductionVL src1 src2));
4612   effect(TEMP tmp, TEMP tmp2);
4613   format %{ "vextracti128_high  $tmp,$src2\n\t"
4614             "vpaddq  $tmp2,$tmp,$src2\n\t"
4615             "pshufd  $tmp,$tmp2,0xE\n\t"
4616             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4617             "movdq   $tmp,$src1\n\t"
4618             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4619             "movdq   $dst,$tmp2\t! add reduction4L" %}
4620   ins_encode %{
4621     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4622     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4623     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4624     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4625     __ movdq($tmp$$XMMRegister, $src1$$Register);
4626     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4627     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4628   %}
4629   ins_pipe( pipe_slow );
4630 %}
4631 
4632 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4633   predicate(UseAVX > 2);
4634   match(Set dst (AddReductionVL src1 src2));
4635   effect(TEMP tmp, TEMP tmp2);
4636   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
4637             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4638             "vextracti128_high  $tmp,$tmp2\n\t"
4639             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4640             "pshufd  $tmp,$tmp2,0xE\n\t"
4641             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4642             "movdq   $tmp,$src1\n\t"
4643             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4644             "movdq   $dst,$tmp2\t! add reduction8L" %}
4645   ins_encode %{
4646     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4647     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4648     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4649     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4650     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4651     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4652     __ movdq($tmp$$XMMRegister, $src1$$Register);
4653     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4654     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4655   %}
4656   ins_pipe( pipe_slow );
4657 %}
4658 #endif
4659 
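// The FP add reductions below use dst both as the incoming scalar and as the
// accumulator, adding the lanes of src2 one at a time with scalar addss/addsd
// (vaddss/vaddsd).  The strictly ordered, lane-by-lane accumulation is
// presumably what keeps the result identical to a sequential scalar loop,
// since FP addition is not associative.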
4660 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4661   predicate(UseSSE >= 1 && UseAVX == 0);
4662   match(Set dst (AddReductionVF dst src2));
4663   effect(TEMP dst, TEMP tmp);
4664   format %{ "addss   $dst,$src2\n\t"
4665             "pshufd  $tmp,$src2,0x01\n\t"
4666             "addss   $dst,$tmp\t! add reduction2F" %}
4667   ins_encode %{
4668     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4669     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4670     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4671   %}
4672   ins_pipe( pipe_slow );
4673 %}
4674 
4675 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4676   predicate(UseAVX > 0);
4677   match(Set dst (AddReductionVF dst src2));
4678   effect(TEMP dst, TEMP tmp);
4679   format %{ "vaddss  $dst,$dst,$src2\n\t"
4680             "pshufd  $tmp,$src2,0x01\n\t"
4681             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4682   ins_encode %{
4683     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4684     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4685     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4686   %}
4687   ins_pipe( pipe_slow );
4688 %}
4689 
4690 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4691   predicate(UseSSE >= 1 && UseAVX == 0);
4692   match(Set dst (AddReductionVF dst src2));
4693   effect(TEMP dst, TEMP tmp);
4694   format %{ "addss   $dst,$src2\n\t"
4695             "pshufd  $tmp,$src2,0x01\n\t"
4696             "addss   $dst,$tmp\n\t"
4697             "pshufd  $tmp,$src2,0x02\n\t"
4698             "addss   $dst,$tmp\n\t"
4699             "pshufd  $tmp,$src2,0x03\n\t"
4700             "addss   $dst,$tmp\t! add reduction4F" %}
4701   ins_encode %{
4702     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4703     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4704     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4705     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4706     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4707     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4708     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4709   %}
4710   ins_pipe( pipe_slow );
4711 %}
4712 
4713 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4714   predicate(UseAVX > 0);
4715   match(Set dst (AddReductionVF dst src2));
4716   effect(TEMP tmp, TEMP dst);
4717   format %{ "vaddss  $dst,$dst,$src2\n\t"
4718             "pshufd  $tmp,$src2,0x01\n\t"
4719             "vaddss  $dst,$dst,$tmp\n\t"
4720             "pshufd  $tmp,$src2,0x02\n\t"
4721             "vaddss  $dst,$dst,$tmp\n\t"
4722             "pshufd  $tmp,$src2,0x03\n\t"
4723             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
4724   ins_encode %{
4725     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4726     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4727     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4728     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4729     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4730     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4731     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4732   %}
4733   ins_pipe( pipe_slow );
4734 %}
4735 
4736 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
4737   predicate(UseAVX > 0);
4738   match(Set dst (AddReductionVF dst src2));
4739   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4740   format %{ "vaddss  $dst,$dst,$src2\n\t"
4741             "pshufd  $tmp,$src2,0x01\n\t"
4742             "vaddss  $dst,$dst,$tmp\n\t"
4743             "pshufd  $tmp,$src2,0x02\n\t"
4744             "vaddss  $dst,$dst,$tmp\n\t"
4745             "pshufd  $tmp,$src2,0x03\n\t"
4746             "vaddss  $dst,$dst,$tmp\n\t"
4747             "vextractf128_high  $tmp2,$src2\n\t"
4748             "vaddss  $dst,$dst,$tmp2\n\t"
4749             "pshufd  $tmp,$tmp2,0x01\n\t"
4750             "vaddss  $dst,$dst,$tmp\n\t"
4751             "pshufd  $tmp,$tmp2,0x02\n\t"
4752             "vaddss  $dst,$dst,$tmp\n\t"
4753             "pshufd  $tmp,$tmp2,0x03\n\t"
4754             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
4755   ins_encode %{
4756     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4757     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4758     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4759     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4760     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4761     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4762     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4763     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4764     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4765     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4766     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4767     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4768     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4769     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4770     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4771   %}
4772   ins_pipe( pipe_slow );
4773 %}
4774 
4775 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
4776   predicate(UseAVX > 2);
4777   match(Set dst (AddReductionVF dst src2));
4778   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4779   format %{ "vaddss  $dst,$dst,$src2\n\t"
4780             "pshufd  $tmp,$src2,0x01\n\t"
4781             "vaddss  $dst,$dst,$tmp\n\t"
4782             "pshufd  $tmp,$src2,0x02\n\t"
4783             "vaddss  $dst,$dst,$tmp\n\t"
4784             "pshufd  $tmp,$src2,0x03\n\t"
4785             "vaddss  $dst,$dst,$tmp\n\t"
4786             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4787             "vaddss  $dst,$dst,$tmp2\n\t"
4788             "pshufd  $tmp,$tmp2,0x01\n\t"
4789             "vaddss  $dst,$dst,$tmp\n\t"
4790             "pshufd  $tmp,$tmp2,0x02\n\t"
4791             "vaddss  $dst,$dst,$tmp\n\t"
4792             "pshufd  $tmp,$tmp2,0x03\n\t"
4793             "vaddss  $dst,$dst,$tmp\n\t"
4794             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4795             "vaddss  $dst,$dst,$tmp2\n\t"
4796             "pshufd  $tmp,$tmp2,0x01\n\t"
4797             "vaddss  $dst,$dst,$tmp\n\t"
4798             "pshufd  $tmp,$tmp2,0x02\n\t"
4799             "vaddss  $dst,$dst,$tmp\n\t"
4800             "pshufd  $tmp,$tmp2,0x03\n\t"
4801             "vaddss  $dst,$dst,$tmp\n\t"
4802             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4803             "vaddss  $dst,$dst,$tmp2\n\t"
4804             "pshufd  $tmp,$tmp2,0x01\n\t"
4805             "vaddss  $dst,$dst,$tmp\n\t"
4806             "pshufd  $tmp,$tmp2,0x02\n\t"
4807             "vaddss  $dst,$dst,$tmp\n\t"
4808             "pshufd  $tmp,$tmp2,0x03\n\t"
4809             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
4810   ins_encode %{
4811     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4812     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4813     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4814     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4815     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4816     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4817     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4818     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4819     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4820     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4821     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4822     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4823     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4824     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4825     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4826     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4827     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4828     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4829     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4830     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4831     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4832     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4833     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4834     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4835     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4836     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4837     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4838     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4839     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4840     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4841     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4842   %}
4843   ins_pipe( pipe_slow );
4844 %}
4845 
4846 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4847   predicate(UseSSE >= 1 && UseAVX == 0);
4848   match(Set dst (AddReductionVD dst src2));
4849   effect(TEMP tmp, TEMP dst);
4850   format %{ "addsd   $dst,$src2\n\t"
4851             "pshufd  $tmp,$src2,0xE\n\t"
4852             "addsd   $dst,$tmp\t! add reduction2D" %}
4853   ins_encode %{
4854     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
4855     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4856     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
4857   %}
4858   ins_pipe( pipe_slow );
4859 %}
4860 
4861 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4862   predicate(UseAVX > 0);
4863   match(Set dst (AddReductionVD dst src2));
4864   effect(TEMP tmp, TEMP dst);
4865   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4866             "pshufd  $tmp,$src2,0xE\n\t"
4867             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
4868   ins_encode %{
4869     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4870     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4871     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4872   %}
4873   ins_pipe( pipe_slow );
4874 %}
4875 
4876 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
4877   predicate(UseAVX > 0);
4878   match(Set dst (AddReductionVD dst src2));
4879   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4880   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4881             "pshufd  $tmp,$src2,0xE\n\t"
4882             "vaddsd  $dst,$dst,$tmp\n\t"
4883             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4884             "vaddsd  $dst,$dst,$tmp2\n\t"
4885             "pshufd  $tmp,$tmp2,0xE\n\t"
4886             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
4887   ins_encode %{
4888     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4889     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4890     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4891     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4892     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4893     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4894     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4895   %}
4896   ins_pipe( pipe_slow );
4897 %}
4898 
4899 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
4900   predicate(UseAVX > 2);
4901   match(Set dst (AddReductionVD dst src2));
4902   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4903   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4904             "pshufd  $tmp,$src2,0xE\n\t"
4905             "vaddsd  $dst,$dst,$tmp\n\t"
4906             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4907             "vaddsd  $dst,$dst,$tmp2\n\t"
4908             "pshufd  $tmp,$tmp2,0xE\n\t"
4909             "vaddsd  $dst,$dst,$tmp\n\t"
4910             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4911             "vaddsd  $dst,$dst,$tmp2\n\t"
4912             "pshufd  $tmp,$tmp2,0xE\n\t"
4913             "vaddsd  $dst,$dst,$tmp\n\t"
4914             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4915             "vaddsd  $dst,$dst,$tmp2\n\t"
4916             "pshufd  $tmp,$tmp2,0xE\n\t"
4917             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
4918   ins_encode %{
4919     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4920     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4921     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4922     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4923     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4924     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4925     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4926     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4927     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4928     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4929     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4930     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4931     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4932     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4933     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4934   %}
4935   ins_pipe( pipe_slow );
4936 %}
4937 
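// Multiply reductions.  These follow the same lane-folding pattern as the add
// reductions above, using pmulld/vpmulld for ints, vpmullq for longs and
// mulss/vmulss for floats.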
4938 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4939   predicate(UseSSE > 3 && UseAVX == 0);
4940   match(Set dst (MulReductionVI src1 src2));
4941   effect(TEMP tmp, TEMP tmp2);
4942   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4943             "pmulld  $tmp2,$src2\n\t"
4944             "movd    $tmp,$src1\n\t"
4945             "pmulld  $tmp2,$tmp\n\t"
4946             "movd    $dst,$tmp2\t! mul reduction2I" %}
4947   ins_encode %{
4948     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4949     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
4950     __ movdl($tmp$$XMMRegister, $src1$$Register);
4951     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4952     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4953   %}
4954   ins_pipe( pipe_slow );
4955 %}
4956 
4957 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4958   predicate(UseAVX > 0);
4959   match(Set dst (MulReductionVI src1 src2));
4960   effect(TEMP tmp, TEMP tmp2);
4961   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
4962             "vpmulld  $tmp,$src2,$tmp2\n\t"
4963             "movd     $tmp2,$src1\n\t"
4964             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
4965             "movd     $dst,$tmp2\t! mul reduction2I" %}
4966   ins_encode %{
4967     int vector_len = 0;
4968     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4969     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4970     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4971     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4972     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4973   %}
4974   ins_pipe( pipe_slow );
4975 %}
4976 
4977 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4978   predicate(UseSSE > 3 && UseAVX == 0);
4979   match(Set dst (MulReductionVI src1 src2));
4980   effect(TEMP tmp, TEMP tmp2);
4981   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4982             "pmulld  $tmp2,$src2\n\t"
4983             "pshufd  $tmp,$tmp2,0x1\n\t"
4984             "pmulld  $tmp2,$tmp\n\t"
4985             "movd    $tmp,$src1\n\t"
4986             "pmulld  $tmp2,$tmp\n\t"
4987             "movd    $dst,$tmp2\t! mul reduction4I" %}
4988   ins_encode %{
4989     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4990     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
4991     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
4992     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4993     __ movdl($tmp$$XMMRegister, $src1$$Register);
4994     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4995     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4996   %}
4997   ins_pipe( pipe_slow );
4998 %}
4999 
5000 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5001   predicate(UseAVX > 0);
5002   match(Set dst (MulReductionVI src1 src2));
5003   effect(TEMP tmp, TEMP tmp2);
5004   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5005             "vpmulld  $tmp,$src2,$tmp2\n\t"
5006             "pshufd   $tmp2,$tmp,0x1\n\t"
5007             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5008             "movd     $tmp2,$src1\n\t"
5009             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5010             "movd     $dst,$tmp2\t! mul reduction4I" %}
5011   ins_encode %{
5012     int vector_len = 0;
5013     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5014     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5015     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5016     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5017     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5018     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5019     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5020   %}
5021   ins_pipe( pipe_slow );
5022 %}
5023 
5024 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5025   predicate(UseAVX > 0);
5026   match(Set dst (MulReductionVI src1 src2));
5027   effect(TEMP tmp, TEMP tmp2);
5028   format %{ "vextracti128_high  $tmp,$src2\n\t"
5029             "vpmulld  $tmp,$tmp,$src2\n\t"
5030             "pshufd   $tmp2,$tmp,0xE\n\t"
5031             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5032             "pshufd   $tmp2,$tmp,0x1\n\t"
5033             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5034             "movd     $tmp2,$src1\n\t"
5035             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5036             "movd     $dst,$tmp2\t! mul reduction8I" %}
5037   ins_encode %{
5038     int vector_len = 0;
5039     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5040     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5041     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5042     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5043     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5044     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5045     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5046     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5047     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5048   %}
5049   ins_pipe( pipe_slow );
5050 %}
5051 
5052 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5053   predicate(UseAVX > 2);
5054   match(Set dst (MulReductionVI src1 src2));
5055   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5056   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5057             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5058             "vextracti128_high  $tmp,$tmp3\n\t"
5059             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5060             "pshufd   $tmp2,$tmp,0xE\n\t"
5061             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5062             "pshufd   $tmp2,$tmp,0x1\n\t"
5063             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5064             "movd     $tmp2,$src1\n\t"
5065             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5066             "movd     $dst,$tmp2\t! mul reduction16I" %}
5067   ins_encode %{
5068     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5069     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5070     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5071     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5072     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5073     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5074     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5075     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5076     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5077     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5078     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5079   %}
5080   ins_pipe( pipe_slow );
5081 %}
5082 
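// vpmullq is an AVX-512DQ instruction, hence the additional
// VM_Version::supports_avx512dq() check in the predicates below.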
5083 #ifdef _LP64
5084 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5085   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5086   match(Set dst (MulReductionVL src1 src2));
5087   effect(TEMP tmp, TEMP tmp2);
5088   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5089             "vpmullq  $tmp,$src2,$tmp2\n\t"
5090             "movdq    $tmp2,$src1\n\t"
5091             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5092             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5093   ins_encode %{
5094     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5095     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5096     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5097     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5098     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5099   %}
5100   ins_pipe( pipe_slow );
5101 %}
5102 
5103 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5104   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5105   match(Set dst (MulReductionVL src1 src2));
5106   effect(TEMP tmp, TEMP tmp2);
5107   format %{ "vextracti128_high  $tmp,$src2\n\t"
5108             "vpmullq  $tmp2,$tmp,$src2\n\t"
5109             "pshufd   $tmp,$tmp2,0xE\n\t"
5110             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5111             "movdq    $tmp,$src1\n\t"
5112             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5113             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5114   ins_encode %{
5115     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5116     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5117     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5118     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5119     __ movdq($tmp$$XMMRegister, $src1$$Register);
5120     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5121     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5122   %}
5123   ins_pipe( pipe_slow );
5124 %}
5125 
5126 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5127   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5128   match(Set dst (MulReductionVL src1 src2));
5129   effect(TEMP tmp, TEMP tmp2);
5130   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5131             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5132             "vextracti128_high  $tmp,$tmp2\n\t"
5133             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5134             "pshufd   $tmp,$tmp2,0xE\n\t"
5135             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5136             "movdq    $tmp,$src1\n\t"
5137             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5138             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5139   ins_encode %{
5140     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5141     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5142     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5143     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5144     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5145     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5146     __ movdq($tmp$$XMMRegister, $src1$$Register);
5147     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5148     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5149   %}
5150   ins_pipe( pipe_slow );
5151 %}
5152 #endif
5153 
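// Float and double mul reductions keep the running product in the low element
// of $dst: pshufd (and, for the wider forms, vextractf128_high/vextractf32x4)
// moves each remaining lane of $src2 into the low position of $tmp, which is
// then folded in with a scalar mulss/mulsd (vmulss/vmulsd in the AVX forms).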
5154 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5155   predicate(UseSSE >= 1 && UseAVX == 0);
5156   match(Set dst (MulReductionVF dst src2));
5157   effect(TEMP dst, TEMP tmp);
5158   format %{ "mulss   $dst,$src2\n\t"
5159             "pshufd  $tmp,$src2,0x01\n\t"
5160             "mulss   $dst,$tmp\t! mul reduction2F" %}
5161   ins_encode %{
5162     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5163     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5164     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5165   %}
5166   ins_pipe( pipe_slow );
5167 %}
5168 
5169 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5170   predicate(UseAVX > 0);
5171   match(Set dst (MulReductionVF dst src2));
5172   effect(TEMP tmp, TEMP dst);
5173   format %{ "vmulss  $dst,$dst,$src2\n\t"
5174             "pshufd  $tmp,$src2,0x01\n\t"
5175             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5176   ins_encode %{
5177     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5178     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5179     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5180   %}
5181   ins_pipe( pipe_slow );
5182 %}
5183 
5184 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5185   predicate(UseSSE >= 1 && UseAVX == 0);
5186   match(Set dst (MulReductionVF dst src2));
5187   effect(TEMP dst, TEMP tmp);
5188   format %{ "mulss   $dst,$src2\n\t"
5189             "pshufd  $tmp,$src2,0x01\n\t"
5190             "mulss   $dst,$tmp\n\t"
5191             "pshufd  $tmp,$src2,0x02\n\t"
5192             "mulss   $dst,$tmp\n\t"
5193             "pshufd  $tmp,$src2,0x03\n\t"
5194             "mulss   $dst,$tmp\t! mul reduction4F" %}
5195   ins_encode %{
5196     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5197     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5198     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5199     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5200     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5201     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5202     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5203   %}
5204   ins_pipe( pipe_slow );
5205 %}
5206 
5207 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5208   predicate(UseAVX > 0);
5209   match(Set dst (MulReductionVF dst src2));
5210   effect(TEMP tmp, TEMP dst);
5211   format %{ "vmulss  $dst,$dst,$src2\n\t"
5212             "pshufd  $tmp,$src2,0x01\n\t"
5213             "vmulss  $dst,$dst,$tmp\n\t"
5214             "pshufd  $tmp,$src2,0x02\n\t"
5215             "vmulss  $dst,$dst,$tmp\n\t"
5216             "pshufd  $tmp,$src2,0x03\n\t"
5217             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5218   ins_encode %{
5219     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5220     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5221     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5222     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5223     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5224     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5225     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5226   %}
5227   ins_pipe( pipe_slow );
5228 %}
5229 
5230 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5231   predicate(UseAVX > 0);
5232   match(Set dst (MulReductionVF dst src2));
5233   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5234   format %{ "vmulss  $dst,$dst,$src2\n\t"
5235             "pshufd  $tmp,$src2,0x01\n\t"
5236             "vmulss  $dst,$dst,$tmp\n\t"
5237             "pshufd  $tmp,$src2,0x02\n\t"
5238             "vmulss  $dst,$dst,$tmp\n\t"
5239             "pshufd  $tmp,$src2,0x03\n\t"
5240             "vmulss  $dst,$dst,$tmp\n\t"
5241             "vextractf128_high  $tmp2,$src2\n\t"
5242             "vmulss  $dst,$dst,$tmp2\n\t"
5243             "pshufd  $tmp,$tmp2,0x01\n\t"
5244             "vmulss  $dst,$dst,$tmp\n\t"
5245             "pshufd  $tmp,$tmp2,0x02\n\t"
5246             "vmulss  $dst,$dst,$tmp\n\t"
5247             "pshufd  $tmp,$tmp2,0x03\n\t"
5248             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5249   ins_encode %{
5250     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5251     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5252     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5253     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5254     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5255     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5256     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5257     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5258     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5259     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5260     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5261     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5262     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5263     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5264     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5265   %}
5266   ins_pipe( pipe_slow );
5267 %}
5268 
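// 512-bit float reduction: reduce the low 128-bit lane with pshufd as above,
// then bring each of the remaining three 128-bit lanes down with
// vextractf32x4 (lane index 0x1..0x3) and reduce its four elements the same way.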
5269 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5270   predicate(UseAVX > 2);
5271   match(Set dst (MulReductionVF dst src2));
5272   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5273   format %{ "vmulss  $dst,$dst,$src2\n\t"
5274             "pshufd  $tmp,$src2,0x01\n\t"
5275             "vmulss  $dst,$dst,$tmp\n\t"
5276             "pshufd  $tmp,$src2,0x02\n\t"
5277             "vmulss  $dst,$dst,$tmp\n\t"
5278             "pshufd  $tmp,$src2,0x03\n\t"
5279             "vmulss  $dst,$dst,$tmp\n\t"
5280             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5281             "vmulss  $dst,$dst,$tmp2\n\t"
5282             "pshufd  $tmp,$tmp2,0x01\n\t"
5283             "vmulss  $dst,$dst,$tmp\n\t"
5284             "pshufd  $tmp,$tmp2,0x02\n\t"
5285             "vmulss  $dst,$dst,$tmp\n\t"
5286             "pshufd  $tmp,$tmp2,0x03\n\t"
5287             "vmulss  $dst,$dst,$tmp\n\t"
5288             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5289             "vmulss  $dst,$dst,$tmp2\n\t"
5290             "pshufd  $tmp,$tmp2,0x01\n\t"
5291             "vmulss  $dst,$dst,$tmp\n\t"
5292             "pshufd  $tmp,$tmp2,0x02\n\t"
5293             "vmulss  $dst,$dst,$tmp\n\t"
5294             "pshufd  $tmp,$tmp2,0x03\n\t"
5295             "vmulss  $dst,$dst,$tmp\n\t"
5296             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5297             "vmulss  $dst,$dst,$tmp2\n\t"
5298             "pshufd  $tmp,$tmp2,0x01\n\t"
5299             "vmulss  $dst,$dst,$tmp\n\t"
5300             "pshufd  $tmp,$tmp2,0x02\n\t"
5301             "vmulss  $dst,$dst,$tmp\n\t"
5302             "pshufd  $tmp,$tmp2,0x03\n\t"
5303             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5304   ins_encode %{
5305     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5306     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5307     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5308     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5309     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5310     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5311     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5312     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5313     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5314     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5315     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5316     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5317     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5318     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5319     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5320     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5321     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5322     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5323     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5324     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5325     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5326     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5327     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5328     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5329     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5330     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5331     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5332     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5333     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5334     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5335     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5336   %}
5337   ins_pipe( pipe_slow );
5338 %}
5339 
5340 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5341   predicate(UseSSE >= 1 && UseAVX == 0);
5342   match(Set dst (MulReductionVD dst src2));
5343   effect(TEMP dst, TEMP tmp);
5344   format %{ "mulsd   $dst,$src2\n\t"
5345             "pshufd  $tmp,$src2,0xE\n\t"
5346             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5347   ins_encode %{
5348     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5349     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5350     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5351   %}
5352   ins_pipe( pipe_slow );
5353 %}
5354 
5355 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5356   predicate(UseAVX > 0);
5357   match(Set dst (MulReductionVD dst src2));
5358   effect(TEMP tmp, TEMP dst);
5359   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5360             "pshufd  $tmp,$src2,0xE\n\t"
5361             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5362   ins_encode %{
5363     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5364     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5365     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5366   %}
5367   ins_pipe( pipe_slow );
5368 %}
5369 
5370 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5371   predicate(UseAVX > 0);
5372   match(Set dst (MulReductionVD dst src2));
5373   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5374   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5375             "pshufd  $tmp,$src2,0xE\n\t"
5376             "vmulsd  $dst,$dst,$tmp\n\t"
5377             "vextractf128_high  $tmp2,$src2\n\t"
5378             "vmulsd  $dst,$dst,$tmp2\n\t"
5379             "pshufd  $tmp,$tmp2,0xE\n\t"
5380             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5381   ins_encode %{
5382     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5383     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5384     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5385     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5386     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5387     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5388     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5389   %}
5390   ins_pipe( pipe_slow );
5391 %}
5392 
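// 512-bit double reduction: same lane-by-lane walk as the 16F case, but each
// extracted 128-bit lane holds only two doubles, so a single pshufd(...,0xE)
// per lane is enough.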
5393 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5394   predicate(UseAVX > 2);
5395   match(Set dst (MulReductionVD dst src2));
5396   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5397   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5398             "pshufd  $tmp,$src2,0xE\n\t"
5399             "vmulsd  $dst,$dst,$tmp\n\t"
5400             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5401             "vmulsd  $dst,$dst,$tmp2\n\t"
5402             "pshufd  $tmp,$tmp2,0xE\n\t"
5403             "vmulsd  $dst,$dst,$tmp\n\t"
5404             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5405             "vmulsd  $dst,$dst,$tmp2\n\t"
5406             "pshufd  $tmp,$tmp2,0xE\n\t"
5407             "vmulsd  $dst,$dst,$tmp\n\t"
5408             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5409             "vmulsd  $dst,$dst,$tmp2\n\t"
5410             "pshufd  $tmp,$tmp2,0xE\n\t"
5411             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5412   ins_encode %{
5413     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5414     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5415     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5416     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5417     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5418     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5419     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5420     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5421     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5422     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5423     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5424     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5425     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5426     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5427     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5428   %}
5429   ins_pipe( pipe_slow );
5430 %}
5431 
5432 // ====================VECTOR ARITHMETIC=======================================
5433 
5434 // --------------------------------- ADD --------------------------------------
5435 
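// Rule-variant naming used for the byte/short add and sub rules below:
//   (no suffix)            SSE form; $dst is both source and destination.
//   _reg_avx / _mem_avx    AVX1/AVX2 without AVX-512
//                          (VM_Version::supports_avxonly() / supports_avx256only()).
//   _reg_evex / _mem_evex  AVX-512 with AVX512BW (supports_avx512bw()).
//   _*_evex_special        AVX-512 without AVX512BW (supports_avx512nobw());
//                          the destination register doubles as the first input.
// The vector_len argument passed to the assembler selects the operand width:
// 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.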
5436 // Bytes vector add
5437 instruct vadd4B(vecS dst, vecS src) %{
5438   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5439   match(Set dst (AddVB dst src));
5440   format %{ "paddb   $dst,$src\t! add packed4B" %}
5441   ins_encode %{
5442     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5443   %}
5444   ins_pipe( pipe_slow );
5445 %}
5446 
5447 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5448   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5449   match(Set dst (AddVB src1 src2));
5450   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5451   ins_encode %{
5452     int vector_len = 0;
5453     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5454   %}
5455   ins_pipe( pipe_slow );
5456 %}
5457 
5458 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5459   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5460   match(Set dst (AddVB src1 src2));
5461   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5462   ins_encode %{
5463     int vector_len = 0;
5464     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5465   %}
5466   ins_pipe( pipe_slow );
5467 %}
5468 
5469 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5470   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5471   match(Set dst (AddVB dst src2));
5472   effect(TEMP src1);
5473   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5474   ins_encode %{
5475     int vector_len = 0;
5476     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5477   %}
5478   ins_pipe( pipe_slow );
5479 %}
5480 
5481 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5482   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5483   match(Set dst (AddVB src (LoadVector mem)));
5484   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5485   ins_encode %{
5486     int vector_len = 0;
5487     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5488   %}
5489   ins_pipe( pipe_slow );
5490 %}
5491 
5492 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5493   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5494   match(Set dst (AddVB src (LoadVector mem)));
5495   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5496   ins_encode %{
5497     int vector_len = 0;
5498     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5499   %}
5500   ins_pipe( pipe_slow );
5501 %}
5502 
5503 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5504   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5505   match(Set dst (AddVB dst (LoadVector mem)));
5506   effect(TEMP src);
5507   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5508   ins_encode %{
5509     int vector_len = 0;
5510     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5511   %}
5512   ins_pipe( pipe_slow );
5513 %}
5514 
5515 instruct vadd8B(vecD dst, vecD src) %{
5516   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5517   match(Set dst (AddVB dst src));
5518   format %{ "paddb   $dst,$src\t! add packed8B" %}
5519   ins_encode %{
5520     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5521   %}
5522   ins_pipe( pipe_slow );
5523 %}
5524 
5525 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5526   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5527   match(Set dst (AddVB src1 src2));
5528   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5529   ins_encode %{
5530     int vector_len = 0;
5531     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5532   %}
5533   ins_pipe( pipe_slow );
5534 %}
5535 
5536 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5537   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5538   match(Set dst (AddVB src1 src2));
5539   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5540   ins_encode %{
5541     int vector_len = 0;
5542     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5543   %}
5544   ins_pipe( pipe_slow );
5545 %}
5546 
5547 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5548   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5549   match(Set dst (AddVB dst src2));
5550   effect(TEMP src1);
5551   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5552   ins_encode %{
5553     int vector_len = 0;
5554     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5555   %}
5556   ins_pipe( pipe_slow );
5557 %}
5558 
5559 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5560   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5561   match(Set dst (AddVB src (LoadVector mem)));
5562   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5563   ins_encode %{
5564     int vector_len = 0;
5565     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5566   %}
5567   ins_pipe( pipe_slow );
5568 %}
5569 
5570 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5571   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5572   match(Set dst (AddVB src (LoadVector mem)));
5573   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5574   ins_encode %{
5575     int vector_len = 0;
5576     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5577   %}
5578   ins_pipe( pipe_slow );
5579 %}
5580 
5581 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5582   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5583   match(Set dst (AddVB dst (LoadVector mem)));
5584   effect(TEMP src);
5585   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5586   ins_encode %{
5587     int vector_len = 0;
5588     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5589   %}
5590   ins_pipe( pipe_slow );
5591 %}
5592 
5593 instruct vadd16B(vecX dst, vecX src) %{
5594   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5595   match(Set dst (AddVB dst src));
5596   format %{ "paddb   $dst,$src\t! add packed16B" %}
5597   ins_encode %{
5598     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5599   %}
5600   ins_pipe( pipe_slow );
5601 %}
5602 
5603 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5604   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5605   match(Set dst (AddVB src1 src2));
5606   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5607   ins_encode %{
5608     int vector_len = 0;
5609     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5610   %}
5611   ins_pipe( pipe_slow );
5612 %}
5613 
5614 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5615   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5616   match(Set dst (AddVB src1 src2));
5617   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5618   ins_encode %{
5619     int vector_len = 0;
5620     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5621   %}
5622   ins_pipe( pipe_slow );
5623 %}
5624 
5625 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5626   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5627   match(Set dst (AddVB dst src2));
5628   effect(TEMP src1);
5629   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5630   ins_encode %{
5631     int vector_len = 0;
5632     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5633   %}
5634   ins_pipe( pipe_slow );
5635 %}
5636 
5637 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5638   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5639   match(Set dst (AddVB src (LoadVector mem)));
5640   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5641   ins_encode %{
5642     int vector_len = 0;
5643     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5644   %}
5645   ins_pipe( pipe_slow );
5646 %}
5647 
5648 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5649   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5650   match(Set dst (AddVB src (LoadVector mem)));
5651   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5652   ins_encode %{
5653     int vector_len = 0;
5654     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5655   %}
5656   ins_pipe( pipe_slow );
5657 %}
5658 
5659 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5660   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5661   match(Set dst (AddVB dst (LoadVector mem)));
5662   effect(TEMP src);
5663   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5664   ins_encode %{
5665     int vector_len = 0;
5666     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5667   %}
5668   ins_pipe( pipe_slow );
5669 %}
5670 
5671 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5672   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5673   match(Set dst (AddVB src1 src2));
5674   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5675   ins_encode %{
5676     int vector_len = 1;
5677     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5678   %}
5679   ins_pipe( pipe_slow );
5680 %}
5681 
5682 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5683   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5684   match(Set dst (AddVB src1 src2));
5685   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5686   ins_encode %{
5687     int vector_len = 1;
5688     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5689   %}
5690   ins_pipe( pipe_slow );
5691 %}
5692 
5693 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5694   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5695   match(Set dst (AddVB dst src2));
5696   effect(TEMP src1);
5697   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5698   ins_encode %{
5699     int vector_len = 1;
5700     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5701   %}
5702   ins_pipe( pipe_slow );
5703 %}
5704 
5705 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5706   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5707   match(Set dst (AddVB src (LoadVector mem)));
5708   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5709   ins_encode %{
5710     int vector_len = 1;
5711     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5712   %}
5713   ins_pipe( pipe_slow );
5714 %}
5715 
5716 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5717   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5718   match(Set dst (AddVB src (LoadVector mem)));
5719   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5720   ins_encode %{
5721     int vector_len = 1;
5722     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5723   %}
5724   ins_pipe( pipe_slow );
5725 %}
5726 
5727 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
5728   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5729   match(Set dst (AddVB dst (LoadVector mem)));
5730   effect(TEMP src);
5731   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5732   ins_encode %{
5733     int vector_len = 1;
5734     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5735   %}
5736   ins_pipe( pipe_slow );
5737 %}
5738 
5739 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5740   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5741   match(Set dst (AddVB src1 src2));
5742   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5743   ins_encode %{
5744     int vector_len = 2;
5745     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5746   %}
5747   ins_pipe( pipe_slow );
5748 %}
5749 
5750 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5751   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5752   match(Set dst (AddVB src (LoadVector mem)));
5753   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
5754   ins_encode %{
5755     int vector_len = 2;
5756     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5757   %}
5758   ins_pipe( pipe_slow );
5759 %}
5760 
5761 // Shorts/Chars vector add
5762 instruct vadd2S(vecS dst, vecS src) %{
5763   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
5764   match(Set dst (AddVS dst src));
5765   format %{ "paddw   $dst,$src\t! add packed2S" %}
5766   ins_encode %{
5767     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5768   %}
5769   ins_pipe( pipe_slow );
5770 %}
5771 
5772 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
5773   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5774   match(Set dst (AddVS src1 src2));
5775   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5776   ins_encode %{
5777     int vector_len = 0;
5778     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5779   %}
5780   ins_pipe( pipe_slow );
5781 %}
5782 
5783 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
5784   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5785   match(Set dst (AddVS src1 src2));
5786   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5787   ins_encode %{
5788     int vector_len = 0;
5789     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5790   %}
5791   ins_pipe( pipe_slow );
5792 %}
5793 
5794 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5795   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5796   match(Set dst (AddVS dst src2));
5797   effect(TEMP src1);
5798   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
5799   ins_encode %{
5800     int vector_len = 0;
5801     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5802   %}
5803   ins_pipe( pipe_slow );
5804 %}
5805 
5806 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
5807   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5808   match(Set dst (AddVS src (LoadVector mem)));
5809   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5810   ins_encode %{
5811     int vector_len = 0;
5812     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5813   %}
5814   ins_pipe( pipe_slow );
5815 %}
5816 
5817 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
5818   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5819   match(Set dst (AddVS src (LoadVector mem)));
5820   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5821   ins_encode %{
5822     int vector_len = 0;
5823     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5824   %}
5825   ins_pipe( pipe_slow );
5826 %}
5827 
5828 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
5829   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5830   match(Set dst (AddVS dst (LoadVector mem)));
5831   effect(TEMP src);
5832   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5833   ins_encode %{
5834     int vector_len = 0;
5835     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5836   %}
5837   ins_pipe( pipe_slow );
5838 %}
5839 
5840 instruct vadd4S(vecD dst, vecD src) %{
5841   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5842   match(Set dst (AddVS dst src));
5843   format %{ "paddw   $dst,$src\t! add packed4S" %}
5844   ins_encode %{
5845     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5846   %}
5847   ins_pipe( pipe_slow );
5848 %}
5849 
5850 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
5851   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5852   match(Set dst (AddVS src1 src2));
5853   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5854   ins_encode %{
5855     int vector_len = 0;
5856     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5857   %}
5858   ins_pipe( pipe_slow );
5859 %}
5860 
5861 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
5862   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5863   match(Set dst (AddVS src1 src2));
5864   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5865   ins_encode %{
5866     int vector_len = 0;
5867     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5868   %}
5869   ins_pipe( pipe_slow );
5870 %}
5871 
5872 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5873   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5874   match(Set dst (AddVS dst src2));
5875   effect(TEMP src1);
5876   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
5877   ins_encode %{
5878     int vector_len = 0;
5879     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5880   %}
5881   ins_pipe( pipe_slow );
5882 %}
5883 
5884 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
5885   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5886   match(Set dst (AddVS src (LoadVector mem)));
5887   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5888   ins_encode %{
5889     int vector_len = 0;
5890     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5891   %}
5892   ins_pipe( pipe_slow );
5893 %}
5894 
5895 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
5896   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5897   match(Set dst (AddVS src (LoadVector mem)));
5898   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5899   ins_encode %{
5900     int vector_len = 0;
5901     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5902   %}
5903   ins_pipe( pipe_slow );
5904 %}
5905 
5906 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
5907   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5908   match(Set dst (AddVS dst (LoadVector mem)));
5909   effect(TEMP src);
5910   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5911   ins_encode %{
5912     int vector_len = 0;
5913     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5914   %}
5915   ins_pipe( pipe_slow );
5916 %}
5917 
5918 instruct vadd8S(vecX dst, vecX src) %{
5919   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5920   match(Set dst (AddVS dst src));
5921   format %{ "paddw   $dst,$src\t! add packed8S" %}
5922   ins_encode %{
5923     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5924   %}
5925   ins_pipe( pipe_slow );
5926 %}
5927 
5928 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
5929   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5930   match(Set dst (AddVS src1 src2));
5931   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5932   ins_encode %{
5933     int vector_len = 0;
5934     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5935   %}
5936   ins_pipe( pipe_slow );
5937 %}
5938 
5939 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
5940   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5941   match(Set dst (AddVS src1 src2));
5942   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5943   ins_encode %{
5944     int vector_len = 0;
5945     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5946   %}
5947   ins_pipe( pipe_slow );
5948 %}
5949 
5950 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5951   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5952   match(Set dst (AddVS dst src2));
5953   effect(TEMP src1);
5954   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
5955   ins_encode %{
5956     int vector_len = 0;
5957     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5958   %}
5959   ins_pipe( pipe_slow );
5960 %}
5961 
5962 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
5963   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5964   match(Set dst (AddVS src (LoadVector mem)));
5965   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5966   ins_encode %{
5967     int vector_len = 0;
5968     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5969   %}
5970   ins_pipe( pipe_slow );
5971 %}
5972 
5973 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
5974   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5975   match(Set dst (AddVS src (LoadVector mem)));
5976   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5977   ins_encode %{
5978     int vector_len = 0;
5979     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5980   %}
5981   ins_pipe( pipe_slow );
5982 %}
5983 
5984 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
5985   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5986   match(Set dst (AddVS dst (LoadVector mem)));
5987   effect(TEMP src);
5988   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5989   ins_encode %{
5990     int vector_len = 0;
5991     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5992   %}
5993   ins_pipe( pipe_slow );
5994 %}
5995 
5996 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
5997   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
5998   match(Set dst (AddVS src1 src2));
5999   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6000   ins_encode %{
6001     int vector_len = 1;
6002     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6003   %}
6004   ins_pipe( pipe_slow );
6005 %}
6006 
6007 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6008   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6009   match(Set dst (AddVS src1 src2));
6010   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6011   ins_encode %{
6012     int vector_len = 1;
6013     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6014   %}
6015   ins_pipe( pipe_slow );
6016 %}
6017 
6018 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6019   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6020   match(Set dst (AddVS dst src2));
6021   effect(TEMP src1);
6022   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6023   ins_encode %{
6024     int vector_len = 1;
6025     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6026   %}
6027   ins_pipe( pipe_slow );
6028 %}
6029 
6030 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6031   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6032   match(Set dst (AddVS src (LoadVector mem)));
6033   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6034   ins_encode %{
6035     int vector_len = 1;
6036     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6037   %}
6038   ins_pipe( pipe_slow );
6039 %}
6040 
6041 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6042   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6043   match(Set dst (AddVS src (LoadVector mem)));
6044   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6045   ins_encode %{
6046     int vector_len = 1;
6047     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6048   %}
6049   ins_pipe( pipe_slow );
6050 %}
6051 
6052 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6053   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6054   match(Set dst (AddVS dst (LoadVector mem)));
6055   effect(TEMP src);
6056   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6057   ins_encode %{
6058     int vector_len = 1;
6059     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6060   %}
6061   ins_pipe( pipe_slow );
6062 %}
6063 
6064 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6065   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6066   match(Set dst (AddVS src1 src2));
6067   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6068   ins_encode %{
6069     int vector_len = 2;
6070     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6071   %}
6072   ins_pipe( pipe_slow );
6073 %}
6074 
6075 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6076   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6077   match(Set dst (AddVS src (LoadVector mem)));
6078   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6079   ins_encode %{
6080     int vector_len = 2;
6081     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6082   %}
6083   ins_pipe( pipe_slow );
6084 %}
6085 
6086 // Integers vector add
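// Integer, long, float and double vector adds do not need AVX512BW, so the
// rules below key only off the UseAVX level and the vector length instead of
// the avx/evex variant split used for bytes and shorts.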
6087 instruct vadd2I(vecD dst, vecD src) %{
6088   predicate(n->as_Vector()->length() == 2);
6089   match(Set dst (AddVI dst src));
6090   format %{ "paddd   $dst,$src\t! add packed2I" %}
6091   ins_encode %{
6092     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6093   %}
6094   ins_pipe( pipe_slow );
6095 %}
6096 
6097 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6098   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6099   match(Set dst (AddVI src1 src2));
6100   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6101   ins_encode %{
6102     int vector_len = 0;
6103     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6104   %}
6105   ins_pipe( pipe_slow );
6106 %}
6107 
6108 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6109   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6110   match(Set dst (AddVI src (LoadVector mem)));
6111   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6112   ins_encode %{
6113     int vector_len = 0;
6114     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6115   %}
6116   ins_pipe( pipe_slow );
6117 %}
6118 
6119 instruct vadd4I(vecX dst, vecX src) %{
6120   predicate(n->as_Vector()->length() == 4);
6121   match(Set dst (AddVI dst src));
6122   format %{ "paddd   $dst,$src\t! add packed4I" %}
6123   ins_encode %{
6124     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6125   %}
6126   ins_pipe( pipe_slow );
6127 %}
6128 
6129 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6130   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6131   match(Set dst (AddVI src1 src2));
6132   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6133   ins_encode %{
6134     int vector_len = 0;
6135     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6136   %}
6137   ins_pipe( pipe_slow );
6138 %}
6139 
6140 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6141   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6142   match(Set dst (AddVI src (LoadVector mem)));
6143   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6144   ins_encode %{
6145     int vector_len = 0;
6146     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6147   %}
6148   ins_pipe( pipe_slow );
6149 %}
6150 
6151 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6152   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6153   match(Set dst (AddVI src1 src2));
6154   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6155   ins_encode %{
6156     int vector_len = 1;
6157     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6158   %}
6159   ins_pipe( pipe_slow );
6160 %}
6161 
6162 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6163   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6164   match(Set dst (AddVI src (LoadVector mem)));
6165   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6166   ins_encode %{
6167     int vector_len = 1;
6168     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6169   %}
6170   ins_pipe( pipe_slow );
6171 %}
6172 
6173 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6174   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6175   match(Set dst (AddVI src1 src2));
6176   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6177   ins_encode %{
6178     int vector_len = 2;
6179     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6180   %}
6181   ins_pipe( pipe_slow );
6182 %}
6183 
6184 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6185   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6186   match(Set dst (AddVI src (LoadVector mem)));
6187   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6188   ins_encode %{
6189     int vector_len = 2;
6190     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6191   %}
6192   ins_pipe( pipe_slow );
6193 %}
6194 
6195 // Longs vector add
6196 instruct vadd2L(vecX dst, vecX src) %{
6197   predicate(n->as_Vector()->length() == 2);
6198   match(Set dst (AddVL dst src));
6199   format %{ "paddq   $dst,$src\t! add packed2L" %}
6200   ins_encode %{
6201     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6202   %}
6203   ins_pipe( pipe_slow );
6204 %}
6205 
6206 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6207   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6208   match(Set dst (AddVL src1 src2));
6209   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6210   ins_encode %{
6211     int vector_len = 0;
6212     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6213   %}
6214   ins_pipe( pipe_slow );
6215 %}
6216 
6217 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6218   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6219   match(Set dst (AddVL src (LoadVector mem)));
6220   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6221   ins_encode %{
6222     int vector_len = 0;
6223     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6224   %}
6225   ins_pipe( pipe_slow );
6226 %}
6227 
6228 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6229   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6230   match(Set dst (AddVL src1 src2));
6231   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6232   ins_encode %{
6233     int vector_len = 1;
6234     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6235   %}
6236   ins_pipe( pipe_slow );
6237 %}
6238 
6239 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6240   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6241   match(Set dst (AddVL src (LoadVector mem)));
6242   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6243   ins_encode %{
6244     int vector_len = 1;
6245     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6246   %}
6247   ins_pipe( pipe_slow );
6248 %}
6249 
6250 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6251   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6252   match(Set dst (AddVL src1 src2));
6253   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6254   ins_encode %{
6255     int vector_len = 2;
6256     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6257   %}
6258   ins_pipe( pipe_slow );
6259 %}
6260 
6261 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6262   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6263   match(Set dst (AddVL src (LoadVector mem)));
6264   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6265   ins_encode %{
6266     int vector_len = 2;
6267     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6268   %}
6269   ins_pipe( pipe_slow );
6270 %}
6271 
6272 // Floats vector add
6273 instruct vadd2F(vecD dst, vecD src) %{
6274   predicate(n->as_Vector()->length() == 2);
6275   match(Set dst (AddVF dst src));
6276   format %{ "addps   $dst,$src\t! add packed2F" %}
6277   ins_encode %{
6278     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6279   %}
6280   ins_pipe( pipe_slow );
6281 %}
6282 
6283 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6284   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6285   match(Set dst (AddVF src1 src2));
6286   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6287   ins_encode %{
6288     int vector_len = 0;
6289     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6290   %}
6291   ins_pipe( pipe_slow );
6292 %}
6293 
6294 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6295   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6296   match(Set dst (AddVF src (LoadVector mem)));
6297   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6298   ins_encode %{
6299     int vector_len = 0;
6300     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6301   %}
6302   ins_pipe( pipe_slow );
6303 %}
6304 
6305 instruct vadd4F(vecX dst, vecX src) %{
6306   predicate(n->as_Vector()->length() == 4);
6307   match(Set dst (AddVF dst src));
6308   format %{ "addps   $dst,$src\t! add packed4F" %}
6309   ins_encode %{
6310     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6311   %}
6312   ins_pipe( pipe_slow );
6313 %}
6314 
6315 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6316   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6317   match(Set dst (AddVF src1 src2));
6318   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6319   ins_encode %{
6320     int vector_len = 0;
6321     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6322   %}
6323   ins_pipe( pipe_slow );
6324 %}
6325 
6326 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6327   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6328   match(Set dst (AddVF src (LoadVector mem)));
6329   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6330   ins_encode %{
6331     int vector_len = 0;
6332     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6333   %}
6334   ins_pipe( pipe_slow );
6335 %}
6336 
6337 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6338   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6339   match(Set dst (AddVF src1 src2));
6340   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6341   ins_encode %{
6342     int vector_len = 1;
6343     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6344   %}
6345   ins_pipe( pipe_slow );
6346 %}
6347 
6348 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6349   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6350   match(Set dst (AddVF src (LoadVector mem)));
6351   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6352   ins_encode %{
6353     int vector_len = 1;
6354     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6355   %}
6356   ins_pipe( pipe_slow );
6357 %}
6358 
6359 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6360   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6361   match(Set dst (AddVF src1 src2));
6362   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6363   ins_encode %{
6364     int vector_len = 2;
6365     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6366   %}
6367   ins_pipe( pipe_slow );
6368 %}
6369 
6370 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6371   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6372   match(Set dst (AddVF src (LoadVector mem)));
6373   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6374   ins_encode %{
6375     int vector_len = 2;
6376     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6377   %}
6378   ins_pipe( pipe_slow );
6379 %}
6380 
6381 // Doubles vector add
6382 instruct vadd2D(vecX dst, vecX src) %{
6383   predicate(n->as_Vector()->length() == 2);
6384   match(Set dst (AddVD dst src));
6385   format %{ "addpd   $dst,$src\t! add packed2D" %}
6386   ins_encode %{
6387     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6388   %}
6389   ins_pipe( pipe_slow );
6390 %}
6391 
6392 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6393   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6394   match(Set dst (AddVD src1 src2));
6395   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6396   ins_encode %{
6397     int vector_len = 0;
6398     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6399   %}
6400   ins_pipe( pipe_slow );
6401 %}
6402 
6403 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6404   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6405   match(Set dst (AddVD src (LoadVector mem)));
6406   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6407   ins_encode %{
6408     int vector_len = 0;
6409     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6410   %}
6411   ins_pipe( pipe_slow );
6412 %}
6413 
6414 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6415   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6416   match(Set dst (AddVD src1 src2));
6417   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6418   ins_encode %{
6419     int vector_len = 1;
6420     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6421   %}
6422   ins_pipe( pipe_slow );
6423 %}
6424 
6425 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6426   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6427   match(Set dst (AddVD src (LoadVector mem)));
6428   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6429   ins_encode %{
6430     int vector_len = 1;
6431     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6432   %}
6433   ins_pipe( pipe_slow );
6434 %}
6435 
6436 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6437   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6438   match(Set dst (AddVD src1 src2));
6439   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6440   ins_encode %{
6441     int vector_len = 2;
6442     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6443   %}
6444   ins_pipe( pipe_slow );
6445 %}
6446 
6447 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6448   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6449   match(Set dst (AddVD src (LoadVector mem)));
6450   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6451   ins_encode %{
6452     int vector_len = 2;
6453     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6454   %}
6455   ins_pipe( pipe_slow );
6456 %}
6457 
6458 // --------------------------------- SUB --------------------------------------
6459 
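// The subtract rules mirror the add rules above: the same SSE/AVX/EVEX
// variant split per element type and vector length, using the packed
// subtract forms (psubb/vpsubb, etc.) instead of the adds.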
6460 // Bytes vector sub
6461 instruct vsub4B(vecS dst, vecS src) %{
6462   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6463   match(Set dst (SubVB dst src));
6464   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6465   ins_encode %{
6466     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6467   %}
6468   ins_pipe( pipe_slow );
6469 %}
6470 
6471 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6472   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6473   match(Set dst (SubVB src1 src2));
6474   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6475   ins_encode %{
6476     int vector_len = 0;
6477     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6478   %}
6479   ins_pipe( pipe_slow );
6480 %}
6481 
6482 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6483   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6484   match(Set dst (SubVB src1 src2));
6485   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6486   ins_encode %{
6487     int vector_len = 0;
6488     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6489   %}
6490   ins_pipe( pipe_slow );
6491 %}
6492 
6493 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6494   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6495   match(Set dst (SubVB dst src2));
6496   effect(TEMP src1);
6497   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6498   ins_encode %{
6499     int vector_len = 0;
6500     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6501   %}
6502   ins_pipe( pipe_slow );
6503 %}
6504 
6505 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6506   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6507   match(Set dst (SubVB src (LoadVector mem)));
6508   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6509   ins_encode %{
6510     int vector_len = 0;
6511     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6512   %}
6513   ins_pipe( pipe_slow );
6514 %}
6515 
6516 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6517   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6518   match(Set dst (SubVB src (LoadVector mem)));
6519   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6520   ins_encode %{
6521     int vector_len = 0;
6522     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6523   %}
6524   ins_pipe( pipe_slow );
6525 %}
6526 
6527 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6528   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6529   match(Set dst (SubVB dst (LoadVector mem)));
6530   effect(TEMP src);
6531   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6532   ins_encode %{
6533     int vector_len = 0;
6534     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6535   %}
6536   ins_pipe( pipe_slow );
6537 %}
6538 
6539 instruct vsub8B(vecD dst, vecD src) %{
6540   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6541   match(Set dst (SubVB dst src));
6542   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6543   ins_encode %{
6544     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6545   %}
6546   ins_pipe( pipe_slow );
6547 %}
6548 
6549 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6550   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6551   match(Set dst (SubVB src1 src2));
6552   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6553   ins_encode %{
6554     int vector_len = 0;
6555     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6556   %}
6557   ins_pipe( pipe_slow );
6558 %}
6559 
6560 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6561   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6562   match(Set dst (SubVB src1 src2));
6563   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6564   ins_encode %{
6565     int vector_len = 0;
6566     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6567   %}
6568   ins_pipe( pipe_slow );
6569 %}
6570 
6571 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6572   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6573   match(Set dst (SubVB dst src2));
6574   effect(TEMP src1);
6575   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6576   ins_encode %{
6577     int vector_len = 0;
6578     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6579   %}
6580   ins_pipe( pipe_slow );
6581 %}
6582 
6583 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6584   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6585   match(Set dst (SubVB src (LoadVector mem)));
6586   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6587   ins_encode %{
6588     int vector_len = 0;
6589     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6590   %}
6591   ins_pipe( pipe_slow );
6592 %}
6593 
6594 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6595   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6596   match(Set dst (SubVB src (LoadVector mem)));
6597   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6598   ins_encode %{
6599     int vector_len = 0;
6600     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6601   %}
6602   ins_pipe( pipe_slow );
6603 %}
6604 
6605 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6606   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6607   match(Set dst (SubVB dst (LoadVector mem)));
6608   effect(TEMP src);
6609   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6610   ins_encode %{
6611     int vector_len = 0;
6612     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6613   %}
6614   ins_pipe( pipe_slow );
6615 %}
6616 
6617 instruct vsub16B(vecX dst, vecX src) %{
6618   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6619   match(Set dst (SubVB dst src));
6620   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6621   ins_encode %{
6622     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6623   %}
6624   ins_pipe( pipe_slow );
6625 %}
6626 
6627 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6628   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6629   match(Set dst (SubVB src1 src2));
6630   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6631   ins_encode %{
6632     int vector_len = 0;
6633     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6634   %}
6635   ins_pipe( pipe_slow );
6636 %}
6637 
6638 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6639   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6640   match(Set dst (SubVB src1 src2));
6641   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6642   ins_encode %{
6643     int vector_len = 0;
6644     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6645   %}
6646   ins_pipe( pipe_slow );
6647 %}
6648 
6649 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6650   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6651   match(Set dst (SubVB dst src2));
6652   effect(TEMP src1);
6653   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6654   ins_encode %{
6655     int vector_len = 0;
6656     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6657   %}
6658   ins_pipe( pipe_slow );
6659 %}
6660 
6661 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6662   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6663   match(Set dst (SubVB src (LoadVector mem)));
6664   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6665   ins_encode %{
6666     int vector_len = 0;
6667     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6668   %}
6669   ins_pipe( pipe_slow );
6670 %}
6671 
6672 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6673   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6674   match(Set dst (SubVB src (LoadVector mem)));
6675   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6676   ins_encode %{
6677     int vector_len = 0;
6678     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6679   %}
6680   ins_pipe( pipe_slow );
6681 %}
6682 
6683 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6684   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6685   match(Set dst (SubVB dst (LoadVector mem)));
6686   effect(TEMP src);
6687   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6688   ins_encode %{
6689     int vector_len = 0;
6690     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6691   %}
6692   ins_pipe( pipe_slow );
6693 %}
6694 
6695 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6696   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6697   match(Set dst (SubVB src1 src2));
6698   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6699   ins_encode %{
6700     int vector_len = 1;
6701     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6702   %}
6703   ins_pipe( pipe_slow );
6704 %}
6705 
6706 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6707   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6708   match(Set dst (SubVB src1 src2));
6709   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6710   ins_encode %{
6711     int vector_len = 1;
6712     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6713   %}
6714   ins_pipe( pipe_slow );
6715 %}
6716 
6717 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6718   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6719   match(Set dst (SubVB dst src2));
6720   effect(TEMP src1);
6721   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6722   ins_encode %{
6723     int vector_len = 1;
6724     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6725   %}
6726   ins_pipe( pipe_slow );
6727 %}
6728 
6729 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
6730   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6731   match(Set dst (SubVB src (LoadVector mem)));
6732   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6733   ins_encode %{
6734     int vector_len = 1;
6735     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6736   %}
6737   ins_pipe( pipe_slow );
6738 %}
6739 
6740 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
6741   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6742   match(Set dst (SubVB src (LoadVector mem)));
6743   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6744   ins_encode %{
6745     int vector_len = 1;
6746     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6747   %}
6748   ins_pipe( pipe_slow );
6749 %}
6750 
6751 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6752   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6753   match(Set dst (SubVB dst (LoadVector mem)));
6754   effect(TEMP src);
6755   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6756   ins_encode %{
6757     int vector_len = 1;
6758     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6759   %}
6760   ins_pipe( pipe_slow );
6761 %}
6762 
6763 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6764   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6765   match(Set dst (SubVB src1 src2));
6766   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6767   ins_encode %{
6768     int vector_len = 2;
6769     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6770   %}
6771   ins_pipe( pipe_slow );
6772 %}
6773 
6774 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6775   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6776   match(Set dst (SubVB src (LoadVector mem)));
6777   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6778   ins_encode %{
6779     int vector_len = 2;
6780     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6781   %}
6782   ins_pipe( pipe_slow );
6783 %}
6784 
6785 // Shorts/Chars vector sub
6786 instruct vsub2S(vecS dst, vecS src) %{
6787   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6788   match(Set dst (SubVS dst src));
6789   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6790   ins_encode %{
6791     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6792   %}
6793   ins_pipe( pipe_slow );
6794 %}
6795 
6796 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6797   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6798   match(Set dst (SubVS src1 src2));
6799   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6800   ins_encode %{
6801     int vector_len = 0;
6802     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6803   %}
6804   ins_pipe( pipe_slow );
6805 %}
6806 
6807 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6808   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6809   match(Set dst (SubVS src1 src2));
6810   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6811   ins_encode %{
6812     int vector_len = 0;
6813     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6814   %}
6815   ins_pipe( pipe_slow );
6816 %}
6817 
6818 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6819   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6820   match(Set dst (SubVS dst src2));
6821   effect(TEMP src1);
6822   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6823   ins_encode %{
6824     int vector_len = 0;
6825     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6826   %}
6827   ins_pipe( pipe_slow );
6828 %}
6829 
6830 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
6831   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6832   match(Set dst (SubVS src (LoadVector mem)));
6833   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6834   ins_encode %{
6835     int vector_len = 0;
6836     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6837   %}
6838   ins_pipe( pipe_slow );
6839 %}
6840 
6841 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
6842   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6843   match(Set dst (SubVS src (LoadVector mem)));
6844   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6845   ins_encode %{
6846     int vector_len = 0;
6847     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6848   %}
6849   ins_pipe( pipe_slow );
6850 %}
6851 
6852 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6853   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6854   match(Set dst (SubVS dst (LoadVector mem)));
6855   effect(TEMP src);
6856   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6857   ins_encode %{
6858     int vector_len = 0;
6859     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6860   %}
6861   ins_pipe( pipe_slow );
6862 %}
6863 
6864 instruct vsub4S(vecD dst, vecD src) %{
6865   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6866   match(Set dst (SubVS dst src));
6867   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6868   ins_encode %{
6869     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6870   %}
6871   ins_pipe( pipe_slow );
6872 %}
6873 
6874 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
6875   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6876   match(Set dst (SubVS src1 src2));
6877   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6878   ins_encode %{
6879     int vector_len = 0;
6880     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6881   %}
6882   ins_pipe( pipe_slow );
6883 %}
6884 
6885 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6886   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6887   match(Set dst (SubVS src1 src2));
6888   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6889   ins_encode %{
6890     int vector_len = 0;
6891     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6892   %}
6893   ins_pipe( pipe_slow );
6894 %}
6895 
6896 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6897   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6898   match(Set dst (SubVS dst src2));
6899   effect(TEMP src1);
6900   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6901   ins_encode %{
6902     int vector_len = 0;
6903     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6904   %}
6905   ins_pipe( pipe_slow );
6906 %}
6907 
6908 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
6909   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6910   match(Set dst (SubVS src (LoadVector mem)));
6911   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6912   ins_encode %{
6913     int vector_len = 0;
6914     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6915   %}
6916   ins_pipe( pipe_slow );
6917 %}
6918 
6919 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
6920   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6921   match(Set dst (SubVS src (LoadVector mem)));
6922   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6923   ins_encode %{
6924     int vector_len = 0;
6925     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6926   %}
6927   ins_pipe( pipe_slow );
6928 %}
6929 
6930 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6931   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6932   match(Set dst (SubVS dst (LoadVector mem)));
6933   effect(TEMP src);
6934   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6935   ins_encode %{
6936     int vector_len = 0;
6937     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6938   %}
6939   ins_pipe( pipe_slow );
6940 %}
6941 
6942 instruct vsub8S(vecX dst, vecX src) %{
6943   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6944   match(Set dst (SubVS dst src));
6945   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6946   ins_encode %{
6947     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6948   %}
6949   ins_pipe( pipe_slow );
6950 %}
6951 
6952 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6953   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6954   match(Set dst (SubVS src1 src2));
6955   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6956   ins_encode %{
6957     int vector_len = 0;
6958     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6959   %}
6960   ins_pipe( pipe_slow );
6961 %}
6962 
6963 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6964   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6965   match(Set dst (SubVS src1 src2));
6966   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6967   ins_encode %{
6968     int vector_len = 0;
6969     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6970   %}
6971   ins_pipe( pipe_slow );
6972 %}
6973 
6974 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6975   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6976   match(Set dst (SubVS dst src2));
6977   effect(TEMP src1);
6978   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6979   ins_encode %{
6980     int vector_len = 0;
6981     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6982   %}
6983   ins_pipe( pipe_slow );
6984 %}
6985 
6986 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
6987   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6988   match(Set dst (SubVS src (LoadVector mem)));
6989   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6990   ins_encode %{
6991     int vector_len = 0;
6992     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6993   %}
6994   ins_pipe( pipe_slow );
6995 %}
6996 
6997 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
6998   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6999   match(Set dst (SubVS src (LoadVector mem)));
7000   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7001   ins_encode %{
7002     int vector_len = 0;
7003     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7004   %}
7005   ins_pipe( pipe_slow );
7006 %}
7007 
7008 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7009   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7010   match(Set dst (SubVS dst (LoadVector mem)));
7011   effect(TEMP src);
7012   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7013   ins_encode %{
7014     int vector_len = 0;
7015     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7016   %}
7017   ins_pipe( pipe_slow );
7018 %}
7019 
7020 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7021   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7022   match(Set dst (SubVS src1 src2));
7023   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7024   ins_encode %{
7025     int vector_len = 1;
7026     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7027   %}
7028   ins_pipe( pipe_slow );
7029 %}
7030 
7031 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7032   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7033   match(Set dst (SubVS src1 src2));
7034   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7035   ins_encode %{
7036     int vector_len = 1;
7037     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7038   %}
7039   ins_pipe( pipe_slow );
7040 %}
7041 
7042 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7043   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7044   match(Set dst (SubVS dst src2));
7045   effect(TEMP src1);
7046   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7047   ins_encode %{
7048     int vector_len = 1;
7049     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7050   %}
7051   ins_pipe( pipe_slow );
7052 %}
7053 
7054 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7055   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7056   match(Set dst (SubVS src (LoadVector mem)));
7057   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7058   ins_encode %{
7059     int vector_len = 1;
7060     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7061   %}
7062   ins_pipe( pipe_slow );
7063 %}
7064 
7065 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7066   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7067   match(Set dst (SubVS src (LoadVector mem)));
7068   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7069   ins_encode %{
7070     int vector_len = 1;
7071     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7072   %}
7073   ins_pipe( pipe_slow );
7074 %}
7075 
7076 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7077   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7078   match(Set dst (SubVS dst (LoadVector mem)));
7079   effect(TEMP src);
7080   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7081   ins_encode %{
7082     int vector_len = 1;
7083     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7084   %}
7085   ins_pipe( pipe_slow );
7086 %}
7087 
7088 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7089   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7090   match(Set dst (SubVS src1 src2));
7091   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7092   ins_encode %{
7093     int vector_len = 2;
7094     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7095   %}
7096   ins_pipe( pipe_slow );
7097 %}
7098 
7099 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7100   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7101   match(Set dst (SubVS src (LoadVector mem)));
7102   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7103   ins_encode %{
7104     int vector_len = 2;
7105     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7106   %}
7107   ins_pipe( pipe_slow );
7108 %}
7109 
7110 // Integers vector sub
7111 instruct vsub2I(vecD dst, vecD src) %{
7112   predicate(n->as_Vector()->length() == 2);
7113   match(Set dst (SubVI dst src));
7114   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7115   ins_encode %{
7116     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7117   %}
7118   ins_pipe( pipe_slow );
7119 %}
7120 
7121 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7122   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7123   match(Set dst (SubVI src1 src2));
7124   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7125   ins_encode %{
7126     int vector_len = 0;
7127     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7128   %}
7129   ins_pipe( pipe_slow );
7130 %}
7131 
7132 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7133   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7134   match(Set dst (SubVI src (LoadVector mem)));
7135   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7136   ins_encode %{
7137     int vector_len = 0;
7138     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7139   %}
7140   ins_pipe( pipe_slow );
7141 %}
7142 
7143 instruct vsub4I(vecX dst, vecX src) %{
7144   predicate(n->as_Vector()->length() == 4);
7145   match(Set dst (SubVI dst src));
7146   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7147   ins_encode %{
7148     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7149   %}
7150   ins_pipe( pipe_slow );
7151 %}
7152 
7153 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7154   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7155   match(Set dst (SubVI src1 src2));
7156   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7157   ins_encode %{
7158     int vector_len = 0;
7159     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7160   %}
7161   ins_pipe( pipe_slow );
7162 %}
7163 
7164 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7165   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7166   match(Set dst (SubVI src (LoadVector mem)));
7167   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7168   ins_encode %{
7169     int vector_len = 0;
7170     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7171   %}
7172   ins_pipe( pipe_slow );
7173 %}
7174 
7175 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7176   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7177   match(Set dst (SubVI src1 src2));
7178   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7179   ins_encode %{
7180     int vector_len = 1;
7181     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7182   %}
7183   ins_pipe( pipe_slow );
7184 %}
7185 
7186 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7187   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7188   match(Set dst (SubVI src (LoadVector mem)));
7189   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7190   ins_encode %{
7191     int vector_len = 1;
7192     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7193   %}
7194   ins_pipe( pipe_slow );
7195 %}
7196 
7197 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7198   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7199   match(Set dst (SubVI src1 src2));
7200   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7201   ins_encode %{
7202     int vector_len = 2;
7203     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7204   %}
7205   ins_pipe( pipe_slow );
7206 %}
7207 
7208 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7209   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7210   match(Set dst (SubVI src (LoadVector mem)));
7211   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7212   ins_encode %{
7213     int vector_len = 2;
7214     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7215   %}
7216   ins_pipe( pipe_slow );
7217 %}
7218 
7219 // Longs vector sub
7220 instruct vsub2L(vecX dst, vecX src) %{
7221   predicate(n->as_Vector()->length() == 2);
7222   match(Set dst (SubVL dst src));
7223   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7224   ins_encode %{
7225     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7226   %}
7227   ins_pipe( pipe_slow );
7228 %}
7229 
7230 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7231   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7232   match(Set dst (SubVL src1 src2));
7233   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7234   ins_encode %{
7235     int vector_len = 0;
7236     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7237   %}
7238   ins_pipe( pipe_slow );
7239 %}
7240 
7241 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7242   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7243   match(Set dst (SubVL src (LoadVector mem)));
7244   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7245   ins_encode %{
7246     int vector_len = 0;
7247     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7248   %}
7249   ins_pipe( pipe_slow );
7250 %}
7251 
7252 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7253   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7254   match(Set dst (SubVL src1 src2));
7255   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7256   ins_encode %{
7257     int vector_len = 1;
7258     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7259   %}
7260   ins_pipe( pipe_slow );
7261 %}
7262 
7263 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7264   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7265   match(Set dst (SubVL src (LoadVector mem)));
7266   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7267   ins_encode %{
7268     int vector_len = 1;
7269     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7270   %}
7271   ins_pipe( pipe_slow );
7272 %}
7273 
7274 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7275   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7276   match(Set dst (SubVL src1 src2));
7277   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7278   ins_encode %{
7279     int vector_len = 2;
7280     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7281   %}
7282   ins_pipe( pipe_slow );
7283 %}
7284 
7285 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7286   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7287   match(Set dst (SubVL src (LoadVector mem)));
7288   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7289   ins_encode %{
7290     int vector_len = 2;
7291     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7292   %}
7293   ins_pipe( pipe_slow );
7294 %}
7295 
7296 // Floats vector sub
7297 instruct vsub2F(vecD dst, vecD src) %{
7298   predicate(n->as_Vector()->length() == 2);
7299   match(Set dst (SubVF dst src));
7300   format %{ "subps   $dst,$src\t! sub packed2F" %}
7301   ins_encode %{
7302     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7303   %}
7304   ins_pipe( pipe_slow );
7305 %}
7306 
7307 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7308   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7309   match(Set dst (SubVF src1 src2));
7310   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7311   ins_encode %{
7312     int vector_len = 0;
7313     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7314   %}
7315   ins_pipe( pipe_slow );
7316 %}
7317 
7318 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7319   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7320   match(Set dst (SubVF src (LoadVector mem)));
7321   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7322   ins_encode %{
7323     int vector_len = 0;
7324     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7325   %}
7326   ins_pipe( pipe_slow );
7327 %}
7328 
7329 instruct vsub4F(vecX dst, vecX src) %{
7330   predicate(n->as_Vector()->length() == 4);
7331   match(Set dst (SubVF dst src));
7332   format %{ "subps   $dst,$src\t! sub packed4F" %}
7333   ins_encode %{
7334     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7335   %}
7336   ins_pipe( pipe_slow );
7337 %}
7338 
7339 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7340   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7341   match(Set dst (SubVF src1 src2));
7342   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7343   ins_encode %{
7344     int vector_len = 0;
7345     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7346   %}
7347   ins_pipe( pipe_slow );
7348 %}
7349 
7350 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7351   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7352   match(Set dst (SubVF src (LoadVector mem)));
7353   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7354   ins_encode %{
7355     int vector_len = 0;
7356     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7357   %}
7358   ins_pipe( pipe_slow );
7359 %}
7360 
7361 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7362   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7363   match(Set dst (SubVF src1 src2));
7364   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7365   ins_encode %{
7366     int vector_len = 1;
7367     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7368   %}
7369   ins_pipe( pipe_slow );
7370 %}
7371 
7372 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7373   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7374   match(Set dst (SubVF src (LoadVector mem)));
7375   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7376   ins_encode %{
7377     int vector_len = 1;
7378     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7379   %}
7380   ins_pipe( pipe_slow );
7381 %}
7382 
7383 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7384   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7385   match(Set dst (SubVF src1 src2));
7386   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7387   ins_encode %{
7388     int vector_len = 2;
7389     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7390   %}
7391   ins_pipe( pipe_slow );
7392 %}
7393 
7394 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7395   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7396   match(Set dst (SubVF src (LoadVector mem)));
7397   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7398   ins_encode %{
7399     int vector_len = 2;
7400     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7401   %}
7402   ins_pipe( pipe_slow );
7403 %}
7404 
7405 // Doubles vector sub
7406 instruct vsub2D(vecX dst, vecX src) %{
7407   predicate(n->as_Vector()->length() == 2);
7408   match(Set dst (SubVD dst src));
7409   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7410   ins_encode %{
7411     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7412   %}
7413   ins_pipe( pipe_slow );
7414 %}
7415 
7416 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7417   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7418   match(Set dst (SubVD src1 src2));
7419   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7420   ins_encode %{
7421     int vector_len = 0;
7422     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7423   %}
7424   ins_pipe( pipe_slow );
7425 %}
7426 
7427 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7428   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7429   match(Set dst (SubVD src (LoadVector mem)));
7430   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7431   ins_encode %{
7432     int vector_len = 0;
7433     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7434   %}
7435   ins_pipe( pipe_slow );
7436 %}
7437 
7438 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7439   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7440   match(Set dst (SubVD src1 src2));
7441   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7442   ins_encode %{
7443     int vector_len = 1;
7444     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7445   %}
7446   ins_pipe( pipe_slow );
7447 %}
7448 
7449 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7450   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7451   match(Set dst (SubVD src (LoadVector mem)));
7452   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7453   ins_encode %{
7454     int vector_len = 1;
7455     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7456   %}
7457   ins_pipe( pipe_slow );
7458 %}
7459 
7460 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7461   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7462   match(Set dst (SubVD src1 src2));
7463   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7464   ins_encode %{
7465     int vector_len = 2;
7466     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7467   %}
7468   ins_pipe( pipe_slow );
7469 %}
7470 
7471 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7472   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7473   match(Set dst (SubVD src (LoadVector mem)));
7474   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7475   ins_encode %{
7476     int vector_len = 2;
7477     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7478   %}
7479   ins_pipe( pipe_slow );
7480 %}
7481 
7482 // --------------------------------- MUL --------------------------------------
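//
// As with SUB above, the multiply rules come in SSE, AVX/EVEX and
// "_evex_special" flavors.  Packed 32-bit integer multiply (pmulld/vpmulld)
// requires SSE4.1 (UseSSE > 3) or AVX; packed 64-bit multiply (vpmullq) is
// only generated when AVX-512DQ is available (see the vmul*L rules below).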
7483 
7484 // Shorts/Chars vector mul
7485 instruct vmul2S(vecS dst, vecS src) %{
7486   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7487   match(Set dst (MulVS dst src));
7488   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
7489   ins_encode %{
7490     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7491   %}
7492   ins_pipe( pipe_slow );
7493 %}
7494 
7495 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7496   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7497   match(Set dst (MulVS src1 src2));
7498   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7499   ins_encode %{
7500     int vector_len = 0;
7501     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7502   %}
7503   ins_pipe( pipe_slow );
7504 %}
7505 
7506 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7507   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7508   match(Set dst (MulVS src1 src2));
7509   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7510   ins_encode %{
7511     int vector_len = 0;
7512     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7513   %}
7514   ins_pipe( pipe_slow );
7515 %}
7516 
7517 instruct vmul2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7518   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7519   match(Set dst (MulVS dst src2));
7520   effect(TEMP src1);
7521   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7522   ins_encode %{
7523     int vector_len = 0;
7524     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7525   %}
7526   ins_pipe( pipe_slow );
7527 %}
7528 
7529 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7530   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7531   match(Set dst (MulVS src (LoadVector mem)));
7532   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7533   ins_encode %{
7534     int vector_len = 0;
7535     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7536   %}
7537   ins_pipe( pipe_slow );
7538 %}
7539 
7540 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7541   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7542   match(Set dst (MulVS src (LoadVector mem)));
7543   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7544   ins_encode %{
7545     int vector_len = 0;
7546     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7547   %}
7548   ins_pipe( pipe_slow );
7549 %}
7550 
7551 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7552   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7553   match(Set dst (MulVS dst (LoadVector mem)));
7554   effect(TEMP src);
7555   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7556   ins_encode %{
7557     int vector_len = 0;
7558     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7559   %}
7560   ins_pipe( pipe_slow );
7561 %}
7562 
7563 instruct vmul4S(vecD dst, vecD src) %{
7564   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7565   match(Set dst (MulVS dst src));
7566   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7567   ins_encode %{
7568     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7569   %}
7570   ins_pipe( pipe_slow );
7571 %}
7572 
7573 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7574   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7575   match(Set dst (MulVS src1 src2));
7576   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7577   ins_encode %{
7578     int vector_len = 0;
7579     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7580   %}
7581   ins_pipe( pipe_slow );
7582 %}
7583 
7584 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7585   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7586   match(Set dst (MulVS src1 src2));
7587   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7588   ins_encode %{
7589     int vector_len = 0;
7590     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7591   %}
7592   ins_pipe( pipe_slow );
7593 %}
7594 
7595 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7596   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7597   match(Set dst (MulVS dst src2));
7598   effect(TEMP src1);
7599   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7600   ins_encode %{
7601     int vector_len = 0;
7602     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7603   %}
7604   ins_pipe( pipe_slow );
7605 %}
7606 
7607 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7608   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7609   match(Set dst (MulVS src (LoadVector mem)));
7610   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7611   ins_encode %{
7612     int vector_len = 0;
7613     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7614   %}
7615   ins_pipe( pipe_slow );
7616 %}
7617 
7618 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7619   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7620   match(Set dst (MulVS src (LoadVector mem)));
7621   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7622   ins_encode %{
7623     int vector_len = 0;
7624     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7625   %}
7626   ins_pipe( pipe_slow );
7627 %}
7628 
7629 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7630   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7631   match(Set dst (MulVS dst (LoadVector mem)));
7632   effect(TEMP src);
7633   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7634   ins_encode %{
7635     int vector_len = 0;
7636     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7637   %}
7638   ins_pipe( pipe_slow );
7639 %}
7640 
7641 instruct vmul8S(vecX dst, vecX src) %{
7642   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7643   match(Set dst (MulVS dst src));
7644   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7645   ins_encode %{
7646     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7647   %}
7648   ins_pipe( pipe_slow );
7649 %}
7650 
7651 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7652   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7653   match(Set dst (MulVS src1 src2));
7654   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7655   ins_encode %{
7656     int vector_len = 0;
7657     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7658   %}
7659   ins_pipe( pipe_slow );
7660 %}
7661 
7662 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7663   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7664   match(Set dst (MulVS src1 src2));
7665   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7666   ins_encode %{
7667     int vector_len = 0;
7668     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7669   %}
7670   ins_pipe( pipe_slow );
7671 %}
7672 
7673 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7674   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7675   match(Set dst (MulVS dst src2));
7676   effect(TEMP src1);
7677   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7678   ins_encode %{
7679     int vector_len = 0;
7680     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7681   %}
7682   ins_pipe( pipe_slow );
7683 %}
7684 
7685 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7686   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7687   match(Set dst (MulVS src (LoadVector mem)));
7688   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7689   ins_encode %{
7690     int vector_len = 0;
7691     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7692   %}
7693   ins_pipe( pipe_slow );
7694 %}
7695 
7696 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7697   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7698   match(Set dst (MulVS src (LoadVector mem)));
7699   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7700   ins_encode %{
7701     int vector_len = 0;
7702     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7703   %}
7704   ins_pipe( pipe_slow );
7705 %}
7706 
7707 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7708   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7709   match(Set dst (MulVS dst (LoadVector mem)));
7710   effect(TEMP src);
7711   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7712   ins_encode %{
7713     int vector_len = 0;
7714     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7715   %}
7716   ins_pipe( pipe_slow );
7717 %}
7718 
7719 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7720   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7721   match(Set dst (MulVS src1 src2));
7722   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7723   ins_encode %{
7724     int vector_len = 1;
7725     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7726   %}
7727   ins_pipe( pipe_slow );
7728 %}
7729 
7730 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7731   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7732   match(Set dst (MulVS src1 src2));
7733   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7734   ins_encode %{
7735     int vector_len = 1;
7736     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7737   %}
7738   ins_pipe( pipe_slow );
7739 %}
7740 
7741 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7742   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7743   match(Set dst (MulVS dst src2));
7744   effect(TEMP src1);
7745   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7746   ins_encode %{
7747     int vector_len = 1;
7748     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7749   %}
7750   ins_pipe( pipe_slow );
7751 %}
7752 
7753 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
7754   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7755   match(Set dst (MulVS src (LoadVector mem)));
7756   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7757   ins_encode %{
7758     int vector_len = 1;
7759     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7760   %}
7761   ins_pipe( pipe_slow );
7762 %}
7763 
7764 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
7765   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7766   match(Set dst (MulVS src (LoadVector mem)));
7767   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7768   ins_encode %{
7769     int vector_len = 1;
7770     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7771   %}
7772   ins_pipe( pipe_slow );
7773 %}
7774 
7775 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7776   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7777   match(Set dst (MulVS dst (LoadVector mem)));
7778   effect(TEMP src);
7779   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7780   ins_encode %{
7781     int vector_len = 1;
7782     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7783   %}
7784   ins_pipe( pipe_slow );
7785 %}
7786 
7787 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7788   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7789   match(Set dst (MulVS src1 src2));
7790   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7791   ins_encode %{
7792     int vector_len = 2;
7793     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7794   %}
7795   ins_pipe( pipe_slow );
7796 %}
7797 
7798 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7799   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7800   match(Set dst (MulVS src (LoadVector mem)));
7801   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7802   ins_encode %{
7803     int vector_len = 2;
7804     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7805   %}
7806   ins_pipe( pipe_slow );
7807 %}
7808 
7809 // Integers vector mul (sse4_1)
7810 instruct vmul2I(vecD dst, vecD src) %{
7811   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7812   match(Set dst (MulVI dst src));
7813   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7814   ins_encode %{
7815     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7816   %}
7817   ins_pipe( pipe_slow );
7818 %}
7819 
7820 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7821   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7822   match(Set dst (MulVI src1 src2));
7823   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7824   ins_encode %{
7825     int vector_len = 0;
7826     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7827   %}
7828   ins_pipe( pipe_slow );
7829 %}
7830 
7831 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7832   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7833   match(Set dst (MulVI src (LoadVector mem)));
7834   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7835   ins_encode %{
7836     int vector_len = 0;
7837     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7838   %}
7839   ins_pipe( pipe_slow );
7840 %}
7841 
7842 instruct vmul4I(vecX dst, vecX src) %{
7843   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7844   match(Set dst (MulVI dst src));
7845   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7846   ins_encode %{
7847     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7848   %}
7849   ins_pipe( pipe_slow );
7850 %}
7851 
7852 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7853   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7854   match(Set dst (MulVI src1 src2));
7855   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7856   ins_encode %{
7857     int vector_len = 0;
7858     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7859   %}
7860   ins_pipe( pipe_slow );
7861 %}
7862 
7863 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7864   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7865   match(Set dst (MulVI src (LoadVector mem)));
7866   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7867   ins_encode %{
7868     int vector_len = 0;
7869     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7870   %}
7871   ins_pipe( pipe_slow );
7872 %}
7873 
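// Longs vector mul
// vpmullq is an AVX-512DQ instruction, so packed-long multiplies are only
// matched when UseAVX > 2 and VM_Version::supports_avx512dq() hold.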
7874 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7875   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7876   match(Set dst (MulVL src1 src2));
7877   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7878   ins_encode %{
7879     int vector_len = 0;
7880     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7881   %}
7882   ins_pipe( pipe_slow );
7883 %}
7884 
7885 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7886   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7887   match(Set dst (MulVL src (LoadVector mem)));
7888   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7889   ins_encode %{
7890     int vector_len = 0;
7891     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7892   %}
7893   ins_pipe( pipe_slow );
7894 %}
7895 
7896 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7897   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7898   match(Set dst (MulVL src1 src2));
7899   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7900   ins_encode %{
7901     int vector_len = 1;
7902     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7903   %}
7904   ins_pipe( pipe_slow );
7905 %}
7906 
7907 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7908   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7909   match(Set dst (MulVL src (LoadVector mem)));
7910   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7911   ins_encode %{
7912     int vector_len = 1;
7913     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7914   %}
7915   ins_pipe( pipe_slow );
7916 %}
7917 
7918 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7919   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7920   match(Set dst (MulVL src1 src2));
7921   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7922   ins_encode %{
7923     int vector_len = 2;
7924     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7925   %}
7926   ins_pipe( pipe_slow );
7927 %}
7928 
7929 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7930   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7931   match(Set dst (MulVL src (LoadVector mem)));
7932   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7933   ins_encode %{
7934     int vector_len = 2;
7935     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7936   %}
7937   ins_pipe( pipe_slow );
7938 %}
7939 
7940 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7941   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7942   match(Set dst (MulVI src1 src2));
7943   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7944   ins_encode %{
7945     int vector_len = 1;
7946     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7947   %}
7948   ins_pipe( pipe_slow );
7949 %}
7950 
7951 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7952   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7953   match(Set dst (MulVI src (LoadVector mem)));
7954   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7955   ins_encode %{
7956     int vector_len = 1;
7957     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7958   %}
7959   ins_pipe( pipe_slow );
7960 %}
7961 
7962 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7963   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7964   match(Set dst (MulVI src1 src2));
7965   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7966   ins_encode %{
7967     int vector_len = 2;
7968     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7969   %}
7970   ins_pipe( pipe_slow );
7971 %}
7972 
7973 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7974   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7975   match(Set dst (MulVI src (LoadVector mem)));
7976   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7977   ins_encode %{
7978     int vector_len = 2;
7979     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7980   %}
7981   ins_pipe( pipe_slow );
7982 %}
7983 
7984 // Floats vector mul
7985 instruct vmul2F(vecD dst, vecD src) %{
7986   predicate(n->as_Vector()->length() == 2);
7987   match(Set dst (MulVF dst src));
7988   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7989   ins_encode %{
7990     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7991   %}
7992   ins_pipe( pipe_slow );
7993 %}
7994 
7995 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7996   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7997   match(Set dst (MulVF src1 src2));
7998   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7999   ins_encode %{
8000     int vector_len = 0;
8001     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8002   %}
8003   ins_pipe( pipe_slow );
8004 %}
8005 
8006 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
8007   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8008   match(Set dst (MulVF src (LoadVector mem)));
8009   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
8010   ins_encode %{
8011     int vector_len = 0;
8012     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8013   %}
8014   ins_pipe( pipe_slow );
8015 %}
8016 
8017 instruct vmul4F(vecX dst, vecX src) %{
8018   predicate(n->as_Vector()->length() == 4);
8019   match(Set dst (MulVF dst src));
8020   format %{ "mulps   $dst,$src\t! mul packed4F" %}
8021   ins_encode %{
8022     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8023   %}
8024   ins_pipe( pipe_slow );
8025 %}
8026 
8027 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8028   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8029   match(Set dst (MulVF src1 src2));
8030   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8031   ins_encode %{
8032     int vector_len = 0;
8033     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8034   %}
8035   ins_pipe( pipe_slow );
8036 %}
8037 
8038 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8039   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8040   match(Set dst (MulVF src (LoadVector mem)));
8041   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8042   ins_encode %{
8043     int vector_len = 0;
8044     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8045   %}
8046   ins_pipe( pipe_slow );
8047 %}
8048 
8049 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8050   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8051   match(Set dst (MulVF src1 src2));
8052   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8053   ins_encode %{
8054     int vector_len = 1;
8055     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8056   %}
8057   ins_pipe( pipe_slow );
8058 %}
8059 
8060 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8061   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8062   match(Set dst (MulVF src (LoadVector mem)));
8063   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8064   ins_encode %{
8065     int vector_len = 1;
8066     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8067   %}
8068   ins_pipe( pipe_slow );
8069 %}
8070 
8071 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8072   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8073   match(Set dst (MulVF src1 src2));
8074   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8075   ins_encode %{
8076     int vector_len = 2;
8077     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8078   %}
8079   ins_pipe( pipe_slow );
8080 %}
8081 
8082 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8083   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8084   match(Set dst (MulVF src (LoadVector mem)));
8085   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8086   ins_encode %{
8087     int vector_len = 2;
8088     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8089   %}
8090   ins_pipe( pipe_slow );
8091 %}
8092 
8093 // Doubles vector mul
8094 instruct vmul2D(vecX dst, vecX src) %{
8095   predicate(n->as_Vector()->length() == 2);
8096   match(Set dst (MulVD dst src));
8097   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8098   ins_encode %{
8099     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8100   %}
8101   ins_pipe( pipe_slow );
8102 %}
8103 
8104 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8105   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8106   match(Set dst (MulVD src1 src2));
8107   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8108   ins_encode %{
8109     int vector_len = 0;
8110     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8111   %}
8112   ins_pipe( pipe_slow );
8113 %}
8114 
8115 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8116   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8117   match(Set dst (MulVD src (LoadVector mem)));
8118   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8119   ins_encode %{
8120     int vector_len = 0;
8121     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8122   %}
8123   ins_pipe( pipe_slow );
8124 %}
8125 
8126 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8127   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8128   match(Set dst (MulVD src1 src2));
8129   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8130   ins_encode %{
8131     int vector_len = 1;
8132     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8133   %}
8134   ins_pipe( pipe_slow );
8135 %}
8136 
8137 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8138   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8139   match(Set dst (MulVD src (LoadVector mem)));
8140   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8141   ins_encode %{
8142     int vector_len = 1;
8143     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8144   %}
8145   ins_pipe( pipe_slow );
8146 %}
8147 
8148 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8149   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8150   match(Set dst (MulVD src1 src2));
8151   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
8152   ins_encode %{
8153     int vector_len = 2;
8154     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8155   %}
8156   ins_pipe( pipe_slow );
8157 %}
8158 
8159 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8160   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8161   match(Set dst (MulVD src (LoadVector mem)));
8162   format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
8163   ins_encode %{
8164     int vector_len = 2;
8165     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8166   %}
8167   ins_pipe( pipe_slow );
8168 %}
8169 
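// Vector conditional move of doubles (AVX1/AVX2 only, per the predicate below).
// cmppd first materializes the comparison result as an all-ones/all-zeros 64-bit
// mask in each lane of $dst, and blendvpd then selects every result lane from
// $src1/$src2 using the sign bit of that mask; this is why $dst is declared TEMP
// and also carries the final result.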
8170 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8171   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
8172   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
8173   effect(TEMP dst, USE src1, USE src2);
8174   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
8175             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
8176          %}
8177   ins_encode %{
8178     int vector_len = 1;
8179     int cond = (Assembler::Condition)($copnd$$cmpcode);
8180     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8181     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8182   %}
8183   ins_pipe( pipe_slow );
8184 %}
8185 
8186 // --------------------------------- DIV --------------------------------------
8187 
8188 // Floats vector div
8189 instruct vdiv2F(vecD dst, vecD src) %{
8190   predicate(n->as_Vector()->length() == 2);
8191   match(Set dst (DivVF dst src));
8192   format %{ "divps   $dst,$src\t! div packed2F" %}
8193   ins_encode %{
8194     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8195   %}
8196   ins_pipe( pipe_slow );
8197 %}
8198 
8199 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
8200   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8201   match(Set dst (DivVF src1 src2));
8202   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8203   ins_encode %{
8204     int vector_len = 0;
8205     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8206   %}
8207   ins_pipe( pipe_slow );
8208 %}
8209 
8210 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8211   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8212   match(Set dst (DivVF src (LoadVector mem)));
8213   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8214   ins_encode %{
8215     int vector_len = 0;
8216     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8217   %}
8218   ins_pipe( pipe_slow );
8219 %}
8220 
8221 instruct vdiv4F(vecX dst, vecX src) %{
8222   predicate(n->as_Vector()->length() == 4);
8223   match(Set dst (DivVF dst src));
8224   format %{ "divps   $dst,$src\t! div packed4F" %}
8225   ins_encode %{
8226     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8227   %}
8228   ins_pipe( pipe_slow );
8229 %}
8230 
8231 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8232   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8233   match(Set dst (DivVF src1 src2));
8234   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8235   ins_encode %{
8236     int vector_len = 0;
8237     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8238   %}
8239   ins_pipe( pipe_slow );
8240 %}
8241 
8242 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8243   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8244   match(Set dst (DivVF src (LoadVector mem)));
8245   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8246   ins_encode %{
8247     int vector_len = 0;
8248     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8249   %}
8250   ins_pipe( pipe_slow );
8251 %}
8252 
8253 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8254   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8255   match(Set dst (DivVF src1 src2));
8256   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8257   ins_encode %{
8258     int vector_len = 1;
8259     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8260   %}
8261   ins_pipe( pipe_slow );
8262 %}
8263 
8264 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8265   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8266   match(Set dst (DivVF src (LoadVector mem)));
8267   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8268   ins_encode %{
8269     int vector_len = 1;
8270     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8271   %}
8272   ins_pipe( pipe_slow );
8273 %}
8274 
8275 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8276   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8277   match(Set dst (DivVF src1 src2));
8278   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8279   ins_encode %{
8280     int vector_len = 2;
8281     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8282   %}
8283   ins_pipe( pipe_slow );
8284 %}
8285 
8286 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8287   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8288   match(Set dst (DivVF src (LoadVector mem)));
8289   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8290   ins_encode %{
8291     int vector_len = 2;
8292     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8293   %}
8294   ins_pipe( pipe_slow );
8295 %}
8296 
8297 // Doubles vector div
8298 instruct vdiv2D(vecX dst, vecX src) %{
8299   predicate(n->as_Vector()->length() == 2);
8300   match(Set dst (DivVD dst src));
8301   format %{ "divpd   $dst,$src\t! div packed2D" %}
8302   ins_encode %{
8303     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8304   %}
8305   ins_pipe( pipe_slow );
8306 %}
8307 
8308 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8309   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8310   match(Set dst (DivVD src1 src2));
8311   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8312   ins_encode %{
8313     int vector_len = 0;
8314     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8315   %}
8316   ins_pipe( pipe_slow );
8317 %}
8318 
8319 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8320   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8321   match(Set dst (DivVD src (LoadVector mem)));
8322   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8323   ins_encode %{
8324     int vector_len = 0;
8325     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8326   %}
8327   ins_pipe( pipe_slow );
8328 %}
8329 
8330 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8331   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8332   match(Set dst (DivVD src1 src2));
8333   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8334   ins_encode %{
8335     int vector_len = 1;
8336     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8337   %}
8338   ins_pipe( pipe_slow );
8339 %}
8340 
8341 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8342   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8343   match(Set dst (DivVD src (LoadVector mem)));
8344   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8345   ins_encode %{
8346     int vector_len = 1;
8347     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8348   %}
8349   ins_pipe( pipe_slow );
8350 %}
8351 
8352 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8353   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8354   match(Set dst (DivVD src1 src2));
8355   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8356   ins_encode %{
8357     int vector_len = 2;
8358     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8359   %}
8360   ins_pipe( pipe_slow );
8361 %}
8362 
8363 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8364   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8365   match(Set dst (DivVD src (LoadVector mem)));
8366   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8367   ins_encode %{
8368     int vector_len = 2;
8369     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8370   %}
8371   ins_pipe( pipe_slow );
8372 %}
8373 
8374 // ------------------------------ Shift ---------------------------------------
8375 
8376 // Left and right shift count vectors are the same on x86
8377 // (only the lowest bits of the xmm register are used as the count).
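// Illustrative sketch (hypothetical Java loop, arrays a, b, c and scalar count s;
// not part of this file): both shift directions, e.g.
//     b[i] = a[i] << s;      // becomes LShiftVI with an LShiftCntV count
//     c[i] = a[i] >>> s;     // becomes URShiftVI with an RShiftCntV count
// can share one count register, since the packed shift instructions read the count
// from the low 64 bits of the xmm operand regardless of direction; the single rule
// below therefore matches both LShiftCntV and RShiftCntV.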
8378 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8379   match(Set dst (LShiftCntV cnt));
8380   match(Set dst (RShiftCntV cnt));
8381   format %{ "movd    $dst,$cnt\t! load shift count" %}
8382   ins_encode %{
8383     __ movdl($dst$$XMMRegister, $cnt$$Register);
8384   %}
8385   ins_pipe( pipe_slow );
8386 %}
8387 
8388 // --------------------------------- Sqrt --------------------------------------
8389 
8390 // Floating point vector sqrt - double precision only
8391 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8392   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8393   match(Set dst (SqrtVD src));
8394   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8395   ins_encode %{
8396     int vector_len = 0;
8397     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8398   %}
8399   ins_pipe( pipe_slow );
8400 %}
8401 
8402 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8403   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8404   match(Set dst (SqrtVD (LoadVector mem)));
8405   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8406   ins_encode %{
8407     int vector_len = 0;
8408     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8409   %}
8410   ins_pipe( pipe_slow );
8411 %}
8412 
8413 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8414   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8415   match(Set dst (SqrtVD src));
8416   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8417   ins_encode %{
8418     int vector_len = 1;
8419     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8420   %}
8421   ins_pipe( pipe_slow );
8422 %}
8423 
8424 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8425   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8426   match(Set dst (SqrtVD (LoadVector mem)));
8427   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8428   ins_encode %{
8429     int vector_len = 1;
8430     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8431   %}
8432   ins_pipe( pipe_slow );
8433 %}
8434 
8435 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8436   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8437   match(Set dst (SqrtVD src));
8438   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8439   ins_encode %{
8440     int vector_len = 2;
8441     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8442   %}
8443   ins_pipe( pipe_slow );
8444 %}
8445 
8446 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8447   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8448   match(Set dst (SqrtVD (LoadVector mem)));
8449   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8450   ins_encode %{
8451     int vector_len = 2;
8452     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8453   %}
8454   ins_pipe( pipe_slow );
8455 %}
8456 
8457 // ------------------------------ LeftShift -----------------------------------
8458 
8459 // Shorts/Chars vector left shift
8460 instruct vsll2S(vecS dst, vecS shift) %{
8461   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8462   match(Set dst (LShiftVS dst shift));
8463   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8464   ins_encode %{
8465     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8466   %}
8467   ins_pipe( pipe_slow );
8468 %}
8469 
8470 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8471   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8472   match(Set dst (LShiftVS dst shift));
8473   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8474   ins_encode %{
8475     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8476   %}
8477   ins_pipe( pipe_slow );
8478 %}
8479 
8480 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8481   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8482   match(Set dst (LShiftVS src shift));
8483   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8484   ins_encode %{
8485     int vector_len = 0;
8486     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8487   %}
8488   ins_pipe( pipe_slow );
8489 %}
8490 
8491 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8492   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8493   match(Set dst (LShiftVS src shift));
8494   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8495   ins_encode %{
8496     int vector_len = 0;
8497     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8498   %}
8499   ins_pipe( pipe_slow );
8500 %}
8501 
8502 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8503   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8504   match(Set dst (LShiftVS dst shift));
8505   effect(TEMP src);
8506   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8507   ins_encode %{
8508     int vector_len = 0;
8509     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8510   %}
8511   ins_pipe( pipe_slow );
8512 %}
8513 
8514 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8515   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8516   match(Set dst (LShiftVS src shift));
8517   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8518   ins_encode %{
8519     int vector_len = 0;
8520     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8521   %}
8522   ins_pipe( pipe_slow );
8523 %}
8524 
8525 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8526   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8527   match(Set dst (LShiftVS src shift));
8528   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8529   ins_encode %{
8530     int vector_len = 0;
8531     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8532   %}
8533   ins_pipe( pipe_slow );
8534 %}
8535 
8536 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8537   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8538   match(Set dst (LShiftVS dst shift));
8539   effect(TEMP src);
8540   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8541   ins_encode %{
8542     int vector_len = 0;
8543     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8544   %}
8545   ins_pipe( pipe_slow );
8546 %}
8547 
8548 instruct vsll4S(vecD dst, vecS shift) %{
8549   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8550   match(Set dst (LShiftVS dst shift));
8551   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8552   ins_encode %{
8553     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8554   %}
8555   ins_pipe( pipe_slow );
8556 %}
8557 
8558 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8559   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8560   match(Set dst (LShiftVS dst shift));
8561   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8562   ins_encode %{
8563     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8564   %}
8565   ins_pipe( pipe_slow );
8566 %}
8567 
8568 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8569   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8570   match(Set dst (LShiftVS src shift));
8571   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8572   ins_encode %{
8573     int vector_len = 0;
8574     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8575   %}
8576   ins_pipe( pipe_slow );
8577 %}
8578 
8579 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8580   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8581   match(Set dst (LShiftVS src shift));
8582   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8583   ins_encode %{
8584     int vector_len = 0;
8585     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8586   %}
8587   ins_pipe( pipe_slow );
8588 %}
8589 
8590 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8591   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8592   match(Set dst (LShiftVS dst shift));
8593   effect(TEMP src);
8594   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8595   ins_encode %{
8596     int vector_len = 0;
8597     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8598   %}
8599   ins_pipe( pipe_slow );
8600 %}
8601 
8602 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8603   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8604   match(Set dst (LShiftVS src shift));
8605   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8606   ins_encode %{
8607     int vector_len = 0;
8608     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8609   %}
8610   ins_pipe( pipe_slow );
8611 %}
8612 
8613 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8614   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8615   match(Set dst (LShiftVS src shift));
8616   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8617   ins_encode %{
8618     int vector_len = 0;
8619     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8620   %}
8621   ins_pipe( pipe_slow );
8622 %}
8623 
8624 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8625   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8626   match(Set dst (LShiftVS dst shift));
8627   effect(TEMP src);
8628   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8629   ins_encode %{
8630     int vector_len = 0;
8631     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8632   %}
8633   ins_pipe( pipe_slow );
8634 %}
8635 
8636 instruct vsll8S(vecX dst, vecS shift) %{
8637   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8638   match(Set dst (LShiftVS dst shift));
8639   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8640   ins_encode %{
8641     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8642   %}
8643   ins_pipe( pipe_slow );
8644 %}
8645 
8646 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8647   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8648   match(Set dst (LShiftVS dst shift));
8649   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8650   ins_encode %{
8651     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8652   %}
8653   ins_pipe( pipe_slow );
8654 %}
8655 
8656 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8657   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8658   match(Set dst (LShiftVS src shift));
8659   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8660   ins_encode %{
8661     int vector_len = 0;
8662     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8663   %}
8664   ins_pipe( pipe_slow );
8665 %}
8666 
8667 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8668   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8669   match(Set dst (LShiftVS src shift));
8670   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8671   ins_encode %{
8672     int vector_len = 0;
8673     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8674   %}
8675   ins_pipe( pipe_slow );
8676 %}
8677 
8678 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8679   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8680   match(Set dst (LShiftVS dst shift));
8681   effect(TEMP src);
8682   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8683   ins_encode %{
8684     int vector_len = 0;
8685     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8686   %}
8687   ins_pipe( pipe_slow );
8688 %}
8689 
8690 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8691   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8692   match(Set dst (LShiftVS src shift));
8693   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8694   ins_encode %{
8695     int vector_len = 0;
8696     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8702   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8703   match(Set dst (LShiftVS src shift));
8704   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8705   ins_encode %{
8706     int vector_len = 0;
8707     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8708   %}
8709   ins_pipe( pipe_slow );
8710 %}
8711 
8712 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8713   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8714   match(Set dst (LShiftVS dst shift));
8715   effect(TEMP src);
8716   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8717   ins_encode %{
8718     int vector_len = 0;
8719     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8720   %}
8721   ins_pipe( pipe_slow );
8722 %}
8723 
8724 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
8725   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8726   match(Set dst (LShiftVS src shift));
8727   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8728   ins_encode %{
8729     int vector_len = 1;
8730     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8731   %}
8732   ins_pipe( pipe_slow );
8733 %}
8734 
8735 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
8736   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8737   match(Set dst (LShiftVS src shift));
8738   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8739   ins_encode %{
8740     int vector_len = 1;
8741     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8742   %}
8743   ins_pipe( pipe_slow );
8744 %}
8745 
8746 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
8747   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8748   match(Set dst (LShiftVS dst shift));
8749   effect(TEMP src);
8750   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8751   ins_encode %{
8752     int vector_len = 1;
8753     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8754   %}
8755   ins_pipe( pipe_slow );
8756 %}
8757 
8758 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
8759   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8760   match(Set dst (LShiftVS src shift));
8761   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8762   ins_encode %{
8763     int vector_len = 1;
8764     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8765   %}
8766   ins_pipe( pipe_slow );
8767 %}
8768 
8769 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
8770   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8771   match(Set dst (LShiftVS src shift));
8772   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8773   ins_encode %{
8774     int vector_len = 1;
8775     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8776   %}
8777   ins_pipe( pipe_slow );
8778 %}
8779 
8780 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
8781   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8782   match(Set dst (LShiftVS dst shift));
8783   effect(TEMP src);
8784   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8785   ins_encode %{
8786     int vector_len = 1;
8787     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8788   %}
8789   ins_pipe( pipe_slow );
8790 %}
8791 
8792 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8793   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8794   match(Set dst (LShiftVS src shift));
8795   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8796   ins_encode %{
8797     int vector_len = 2;
8798     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8799   %}
8800   ins_pipe( pipe_slow );
8801 %}
8802 
8803 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8804   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8805   match(Set dst (LShiftVS src shift));
8806   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8807   ins_encode %{
8808     int vector_len = 2;
8809     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8810   %}
8811   ins_pipe( pipe_slow );
8812 %}
8813 
8814 // Integers vector left shift
8815 instruct vsll2I(vecD dst, vecS shift) %{
8816   predicate(n->as_Vector()->length() == 2);
8817   match(Set dst (LShiftVI dst shift));
8818   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8819   ins_encode %{
8820     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8821   %}
8822   ins_pipe( pipe_slow );
8823 %}
8824 
8825 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8826   predicate(n->as_Vector()->length() == 2);
8827   match(Set dst (LShiftVI dst shift));
8828   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8829   ins_encode %{
8830     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8831   %}
8832   ins_pipe( pipe_slow );
8833 %}
8834 
8835 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8836   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8837   match(Set dst (LShiftVI src shift));
8838   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8839   ins_encode %{
8840     int vector_len = 0;
8841     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8842   %}
8843   ins_pipe( pipe_slow );
8844 %}
8845 
8846 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8847   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8848   match(Set dst (LShiftVI src shift));
8849   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8850   ins_encode %{
8851     int vector_len = 0;
8852     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8853   %}
8854   ins_pipe( pipe_slow );
8855 %}
8856 
8857 instruct vsll4I(vecX dst, vecS shift) %{
8858   predicate(n->as_Vector()->length() == 4);
8859   match(Set dst (LShiftVI dst shift));
8860   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8861   ins_encode %{
8862     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8863   %}
8864   ins_pipe( pipe_slow );
8865 %}
8866 
8867 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8868   predicate(n->as_Vector()->length() == 4);
8869   match(Set dst (LShiftVI dst shift));
8870   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8871   ins_encode %{
8872     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8873   %}
8874   ins_pipe( pipe_slow );
8875 %}
8876 
8877 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8878   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8879   match(Set dst (LShiftVI src shift));
8880   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8881   ins_encode %{
8882     int vector_len = 0;
8883     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8884   %}
8885   ins_pipe( pipe_slow );
8886 %}
8887 
8888 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8889   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8890   match(Set dst (LShiftVI src shift));
8891   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8892   ins_encode %{
8893     int vector_len = 0;
8894     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8895   %}
8896   ins_pipe( pipe_slow );
8897 %}
8898 
8899 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
8900   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8901   match(Set dst (LShiftVI src shift));
8902   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8903   ins_encode %{
8904     int vector_len = 1;
8905     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8906   %}
8907   ins_pipe( pipe_slow );
8908 %}
8909 
8910 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8911   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8912   match(Set dst (LShiftVI src shift));
8913   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8914   ins_encode %{
8915     int vector_len = 1;
8916     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8917   %}
8918   ins_pipe( pipe_slow );
8919 %}
8920 
8921 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
8922   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8923   match(Set dst (LShiftVI src shift));
8924   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8925   ins_encode %{
8926     int vector_len = 2;
8927     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8928   %}
8929   ins_pipe( pipe_slow );
8930 %}
8931 
8932 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8933   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8934   match(Set dst (LShiftVI src shift));
8935   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8936   ins_encode %{
8937     int vector_len = 2;
8938     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8939   %}
8940   ins_pipe( pipe_slow );
8941 %}
8942 
8943 // Longs vector left shift
8944 instruct vsll2L(vecX dst, vecS shift) %{
8945   predicate(n->as_Vector()->length() == 2);
8946   match(Set dst (LShiftVL dst shift));
8947   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8948   ins_encode %{
8949     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
8950   %}
8951   ins_pipe( pipe_slow );
8952 %}
8953 
8954 instruct vsll2L_imm(vecX dst, immI8 shift) %{
8955   predicate(n->as_Vector()->length() == 2);
8956   match(Set dst (LShiftVL dst shift));
8957   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8958   ins_encode %{
8959     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
8960   %}
8961   ins_pipe( pipe_slow );
8962 %}
8963 
8964 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
8965   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8966   match(Set dst (LShiftVL src shift));
8967   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8968   ins_encode %{
8969     int vector_len = 0;
8970     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8971   %}
8972   ins_pipe( pipe_slow );
8973 %}
8974 
8975 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8976   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8977   match(Set dst (LShiftVL src shift));
8978   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8979   ins_encode %{
8980     int vector_len = 0;
8981     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8982   %}
8983   ins_pipe( pipe_slow );
8984 %}
8985 
8986 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
8987   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8988   match(Set dst (LShiftVL src shift));
8989   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8990   ins_encode %{
8991     int vector_len = 1;
8992     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8993   %}
8994   ins_pipe( pipe_slow );
8995 %}
8996 
8997 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8998   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8999   match(Set dst (LShiftVL src shift));
9000   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9001   ins_encode %{
9002     int vector_len = 1;
9003     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9004   %}
9005   ins_pipe( pipe_slow );
9006 %}
9007 
9008 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9009   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9010   match(Set dst (LShiftVL src shift));
9011   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9012   ins_encode %{
9013     int vector_len = 2;
9014     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9015   %}
9016   ins_pipe( pipe_slow );
9017 %}
9018 
9019 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9020   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9021   match(Set dst (LShiftVL src shift));
9022   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9023   ins_encode %{
9024     int vector_len = 2;
9025     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9026   %}
9027   ins_pipe( pipe_slow );
9028 %}
9029 
9030 // ----------------------- LogicalRightShift -----------------------------------
9031 
9032 // Shorts vector logical right shift produces an incorrect Java result
9033 // for negative data, because Java code converts a short value into an int with
9034 // sign extension before the shift. Char vectors are fine, however, since chars
9035 // are unsigned values.
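// Worked example (illustrative Java only, hypothetical values):
//     short s = (short)0xFFFC;           // -4
//     int scalar = (s >>> 2) & 0xFFFF;   // Java: sign-extend to 0xFFFFFFFC, shift, low 16 bits -> 0xFFFF (-1)
//     int packed = (s & 0xFFFF) >>> 2;   // what a 16-bit psrlw lane computes -> 0x3FFF (16383)
//     char c = (char)0xFFFC;             // 65532
//     int scalarChar = c >>> 2;          // chars zero-extend, also 0x3FFF, matching the packed result
// The scalar and packed results differ for negative shorts but agree for chars.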
9036 
9037 instruct vsrl2S(vecS dst, vecS shift) %{
9038   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9039   match(Set dst (URShiftVS dst shift));
9040   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9041   ins_encode %{
9042     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9043   %}
9044   ins_pipe( pipe_slow );
9045 %}
9046 
9047 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9048   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9049   match(Set dst (URShiftVS dst shift));
9050   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9051   ins_encode %{
9052     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9053   %}
9054   ins_pipe( pipe_slow );
9055 %}
9056 
9057 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9058   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9059   match(Set dst (URShiftVS src shift));
9060   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9061   ins_encode %{
9062     int vector_len = 0;
9063     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9064   %}
9065   ins_pipe( pipe_slow );
9066 %}
9067 
9068 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9069   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9070   match(Set dst (URShiftVS src shift));
9071   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9072   ins_encode %{
9073     int vector_len = 0;
9074     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9075   %}
9076   ins_pipe( pipe_slow );
9077 %}
9078 
9079 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9080   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9081   match(Set dst (URShiftVS dst shift));
9082   effect(TEMP src);
9083   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9084   ins_encode %{
9085     int vector_len = 0;
9086     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9087   %}
9088   ins_pipe( pipe_slow );
9089 %}
9090 
9091 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9092   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9093   match(Set dst (URShiftVS src shift));
9094   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9095   ins_encode %{
9096     int vector_len = 0;
9097     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9098   %}
9099   ins_pipe( pipe_slow );
9100 %}
9101 
9102 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9103   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9104   match(Set dst (URShiftVS src shift));
9105   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9106   ins_encode %{
9107     int vector_len = 0;
9108     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9109   %}
9110   ins_pipe( pipe_slow );
9111 %}
9112 
9113 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9114   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9115   match(Set dst (URShiftVS dst shift));
9116   effect(TEMP src);
9117   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9118   ins_encode %{
9119     int vector_len = 0;
9120     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9121   %}
9122   ins_pipe( pipe_slow );
9123 %}
9124 
9125 instruct vsrl4S(vecD dst, vecS shift) %{
9126   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9127   match(Set dst (URShiftVS dst shift));
9128   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9129   ins_encode %{
9130     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9131   %}
9132   ins_pipe( pipe_slow );
9133 %}
9134 
9135 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9136   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9137   match(Set dst (URShiftVS dst shift));
9138   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9139   ins_encode %{
9140     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9141   %}
9142   ins_pipe( pipe_slow );
9143 %}
9144 
9145 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9146   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9147   match(Set dst (URShiftVS src shift));
9148   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9149   ins_encode %{
9150     int vector_len = 0;
9151     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9152   %}
9153   ins_pipe( pipe_slow );
9154 %}
9155 
9156 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9157   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9158   match(Set dst (URShiftVS src shift));
9159   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9160   ins_encode %{
9161     int vector_len = 0;
9162     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9163   %}
9164   ins_pipe( pipe_slow );
9165 %}
9166 
9167 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9168   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9169   match(Set dst (URShiftVS dst shift));
9170   effect(TEMP src);
9171   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9172   ins_encode %{
9173     int vector_len = 0;
9174     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9175   %}
9176   ins_pipe( pipe_slow );
9177 %}
9178 
9179 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9180   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9181   match(Set dst (URShiftVS src shift));
9182   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9183   ins_encode %{
9184     int vector_len = 0;
9185     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9186   %}
9187   ins_pipe( pipe_slow );
9188 %}
9189 
9190 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9191   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9192   match(Set dst (URShiftVS src shift));
9193   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9194   ins_encode %{
9195     int vector_len = 0;
9196     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9197   %}
9198   ins_pipe( pipe_slow );
9199 %}
9200 
9201 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9202   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9203   match(Set dst (URShiftVS dst shift));
9204   effect(TEMP src);
9205   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9206   ins_encode %{
9207     int vector_len = 0;
9208     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9209   %}
9210   ins_pipe( pipe_slow );
9211 %}
9212 
9213 instruct vsrl8S(vecX dst, vecS shift) %{
9214   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9215   match(Set dst (URShiftVS dst shift));
9216   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9217   ins_encode %{
9218     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9219   %}
9220   ins_pipe( pipe_slow );
9221 %}
9222 
9223 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9224   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9225   match(Set dst (URShiftVS dst shift));
9226   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9227   ins_encode %{
9228     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9229   %}
9230   ins_pipe( pipe_slow );
9231 %}
9232 
9233 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9234   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9235   match(Set dst (URShiftVS src shift));
9236   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9237   ins_encode %{
9238     int vector_len = 0;
9239     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9240   %}
9241   ins_pipe( pipe_slow );
9242 %}
9243 
9244 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9245   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9246   match(Set dst (URShiftVS src shift));
9247   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9248   ins_encode %{
9249     int vector_len = 0;
9250     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9251   %}
9252   ins_pipe( pipe_slow );
9253 %}
9254 
9255 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9256   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9257   match(Set dst (URShiftVS dst shift));
9258   effect(TEMP src);
9259   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9260   ins_encode %{
9261     int vector_len = 0;
9262     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9263   %}
9264   ins_pipe( pipe_slow );
9265 %}
9266 
9267 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9268   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9269   match(Set dst (URShiftVS src shift));
9270   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9271   ins_encode %{
9272     int vector_len = 0;
9273     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9274   %}
9275   ins_pipe( pipe_slow );
9276 %}
9277 
9278 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9279   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9280   match(Set dst (URShiftVS src shift));
9281   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9282   ins_encode %{
9283     int vector_len = 0;
9284     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9285   %}
9286   ins_pipe( pipe_slow );
9287 %}
9288 
9289 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9290   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9291   match(Set dst (URShiftVS dst shift));
9292   effect(TEMP src);
9293   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9294   ins_encode %{
9295     int vector_len = 0;
9296     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9297   %}
9298   ins_pipe( pipe_slow );
9299 %}
9300 
9301 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9302   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9303   match(Set dst (URShiftVS src shift));
9304   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9305   ins_encode %{
9306     int vector_len = 1;
9307     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9313   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9314   match(Set dst (URShiftVS src shift));
9315   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9316   ins_encode %{
9317     int vector_len = 1;
9318     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9319   %}
9320   ins_pipe( pipe_slow );
9321 %}
9322 
9323 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9324   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9325   match(Set dst (URShiftVS dst shift));
9326   effect(TEMP src);
9327   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9328   ins_encode %{
9329     int vector_len = 1;
9330     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9331   %}
9332   ins_pipe( pipe_slow );
9333 %}
9334 
9335 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9336   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9337   match(Set dst (URShiftVS src shift));
9338   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9339   ins_encode %{
9340     int vector_len = 1;
9341     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9342   %}
9343   ins_pipe( pipe_slow );
9344 %}
9345 
9346 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9347   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9348   match(Set dst (URShiftVS src shift));
9349   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9350   ins_encode %{
9351     int vector_len = 1;
9352     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9353   %}
9354   ins_pipe( pipe_slow );
9355 %}
9356 
9357 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9358   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9359   match(Set dst (URShiftVS dst shift));
9360   effect(TEMP src);
9361   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9362   ins_encode %{
9363     int vector_len = 1;
9364     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9365   %}
9366   ins_pipe( pipe_slow );
9367 %}
9368 
9369 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9370   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9371   match(Set dst (URShiftVS src shift));
9372   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9373   ins_encode %{
9374     int vector_len = 2;
9375     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9376   %}
9377   ins_pipe( pipe_slow );
9378 %}
9379 
9380 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9381   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9382   match(Set dst (URShiftVS src shift));
9383   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9384   ins_encode %{
9385     int vector_len = 2;
9386     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9387   %}
9388   ins_pipe( pipe_slow );
9389 %}
9390 
9391 // Integers vector logical right shift
9392 instruct vsrl2I(vecD dst, vecS shift) %{
9393   predicate(n->as_Vector()->length() == 2);
9394   match(Set dst (URShiftVI dst shift));
9395   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9396   ins_encode %{
9397     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9398   %}
9399   ins_pipe( pipe_slow );
9400 %}
9401 
9402 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
9403   predicate(n->as_Vector()->length() == 2);
9404   match(Set dst (URShiftVI dst shift));
9405   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9406   ins_encode %{
9407     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9408   %}
9409   ins_pipe( pipe_slow );
9410 %}
9411 
9412 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
9413   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9414   match(Set dst (URShiftVI src shift));
9415   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9416   ins_encode %{
9417     int vector_len = 0;
9418     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9419   %}
9420   ins_pipe( pipe_slow );
9421 %}
9422 
9423 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9424   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9425   match(Set dst (URShiftVI src shift));
9426   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9427   ins_encode %{
9428     int vector_len = 0;
9429     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9430   %}
9431   ins_pipe( pipe_slow );
9432 %}
9433 
9434 instruct vsrl4I(vecX dst, vecS shift) %{
9435   predicate(n->as_Vector()->length() == 4);
9436   match(Set dst (URShiftVI dst shift));
9437   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9438   ins_encode %{
9439     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9440   %}
9441   ins_pipe( pipe_slow );
9442 %}
9443 
9444 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
9445   predicate(n->as_Vector()->length() == 4);
9446   match(Set dst (URShiftVI dst shift));
9447   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9448   ins_encode %{
9449     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9450   %}
9451   ins_pipe( pipe_slow );
9452 %}
9453 
9454 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
9455   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9456   match(Set dst (URShiftVI src shift));
9457   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9458   ins_encode %{
9459     int vector_len = 0;
9460     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9461   %}
9462   ins_pipe( pipe_slow );
9463 %}
9464 
9465 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9466   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9467   match(Set dst (URShiftVI src shift));
9468   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9469   ins_encode %{
9470     int vector_len = 0;
9471     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9472   %}
9473   ins_pipe( pipe_slow );
9474 %}
9475 
9476 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
9477   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9478   match(Set dst (URShiftVI src shift));
9479   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9480   ins_encode %{
9481     int vector_len = 1;
9482     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9483   %}
9484   ins_pipe( pipe_slow );
9485 %}
9486 
9487 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9488   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9489   match(Set dst (URShiftVI src shift));
9490   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9491   ins_encode %{
9492     int vector_len = 1;
9493     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9494   %}
9495   ins_pipe( pipe_slow );
9496 %}
9497 
9498 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
9499   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9500   match(Set dst (URShiftVI src shift));
9501   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9502   ins_encode %{
9503     int vector_len = 2;
9504     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9505   %}
9506   ins_pipe( pipe_slow );
9507 %}
9508 
9509 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9510   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9511   match(Set dst (URShiftVI src shift));
9512   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9513   ins_encode %{
9514     int vector_len = 2;
9515     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9516   %}
9517   ins_pipe( pipe_slow );
9518 %}
9519 
9520 // Longs vector logical right shift
9521 instruct vsrl2L(vecX dst, vecS shift) %{
9522   predicate(n->as_Vector()->length() == 2);
9523   match(Set dst (URShiftVL dst shift));
9524   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9525   ins_encode %{
9526     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
9527   %}
9528   ins_pipe( pipe_slow );
9529 %}
9530 
9531 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9532   predicate(n->as_Vector()->length() == 2);
9533   match(Set dst (URShiftVL dst shift));
9534   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9535   ins_encode %{
9536     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9537   %}
9538   ins_pipe( pipe_slow );
9539 %}
9540 
9541 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9542   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9543   match(Set dst (URShiftVL src shift));
9544   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9545   ins_encode %{
9546     int vector_len = 0;
9547     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9548   %}
9549   ins_pipe( pipe_slow );
9550 %}
9551 
9552 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9553   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9554   match(Set dst (URShiftVL src shift));
9555   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9556   ins_encode %{
9557     int vector_len = 0;
9558     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9559   %}
9560   ins_pipe( pipe_slow );
9561 %}
9562 
9563 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9564   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9565   match(Set dst (URShiftVL src shift));
9566   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9567   ins_encode %{
9568     int vector_len = 1;
9569     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9570   %}
9571   ins_pipe( pipe_slow );
9572 %}
9573 
9574 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9575   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9576   match(Set dst (URShiftVL src shift));
9577   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9578   ins_encode %{
9579     int vector_len = 1;
9580     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9581   %}
9582   ins_pipe( pipe_slow );
9583 %}
9584 
9585 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9586   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9587   match(Set dst (URShiftVL src shift));
9588   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9589   ins_encode %{
9590     int vector_len = 2;
9591     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9592   %}
9593   ins_pipe( pipe_slow );
9594 %}
9595 
9596 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9597   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9598   match(Set dst (URShiftVL src shift));
9599   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9600   ins_encode %{
9601     int vector_len = 2;
9602     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9603   %}
9604   ins_pipe( pipe_slow );
9605 %}
9606 
9607 // ------------------- ArithmeticRightShift -----------------------------------
9608 
9609 // Shorts/Chars vector arithmetic right shift
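// Arithmetic (signed) right shift replicates the sign bit into the vacated
// positions (e.g. -4 >> 1 == -2), whereas the logical forms above shift in
// zeroes.  As with the other short/char shifts, the AVX and EVEX flavors are
// selected through VM_Version::supports_avxonly(), supports_avx512bw() and
// supports_avx512nobw(); the *_evex_special rules (EVEX without AVX512BW)
// match the two-operand "dst = dst >> shift" form and only name $src as a
// TEMP.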
9610 instruct vsra2S(vecS dst, vecS shift) %{
9611   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9612   match(Set dst (RShiftVS dst shift));
9613   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9614   ins_encode %{
9615     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9621   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9622   match(Set dst (RShiftVS dst shift));
9623   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9624   ins_encode %{
9625     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9626   %}
9627   ins_pipe( pipe_slow );
9628 %}
9629 
9630 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9631   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9632   match(Set dst (RShiftVS src shift));
9633   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9634   ins_encode %{
9635     int vector_len = 0;
9636     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9637   %}
9638   ins_pipe( pipe_slow );
9639 %}
9640 
9641 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9642   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9643   match(Set dst (RShiftVS src shift));
9644   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9645   ins_encode %{
9646     int vector_len = 0;
9647     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9648   %}
9649   ins_pipe( pipe_slow );
9650 %}
9651 
9652 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9653   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9654   match(Set dst (RShiftVS dst shift));
9655   effect(TEMP src);
9656   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9657   ins_encode %{
9658     int vector_len = 0;
9659     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9660   %}
9661   ins_pipe( pipe_slow );
9662 %}
9663 
9664 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9665   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9666   match(Set dst (RShiftVS src shift));
9667   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9668   ins_encode %{
9669     int vector_len = 0;
9670     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9671   %}
9672   ins_pipe( pipe_slow );
9673 %}
9674 
9675 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9676   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9677   match(Set dst (RShiftVS src shift));
9678   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9679   ins_encode %{
9680     int vector_len = 0;
9681     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9682   %}
9683   ins_pipe( pipe_slow );
9684 %}
9685 
9686 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9687   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9688   match(Set dst (RShiftVS dst shift));
9689   effect(TEMP src);
9690   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9691   ins_encode %{
9692     int vector_len = 0;
9693     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9694   %}
9695   ins_pipe( pipe_slow );
9696 %}
9697 
9698 instruct vsra4S(vecD dst, vecS shift) %{
9699   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9700   match(Set dst (RShiftVS dst shift));
9701   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9702   ins_encode %{
9703     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9704   %}
9705   ins_pipe( pipe_slow );
9706 %}
9707 
9708 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9709   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9710   match(Set dst (RShiftVS dst shift));
9711   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9712   ins_encode %{
9713     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9714   %}
9715   ins_pipe( pipe_slow );
9716 %}
9717 
9718 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9719   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9720   match(Set dst (RShiftVS src shift));
9721   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9722   ins_encode %{
9723     int vector_len = 0;
9724     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9725   %}
9726   ins_pipe( pipe_slow );
9727 %}
9728 
9729 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9730   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9731   match(Set dst (RShiftVS src shift));
9732   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9733   ins_encode %{
9734     int vector_len = 0;
9735     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9736   %}
9737   ins_pipe( pipe_slow );
9738 %}
9739 
9740 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9741   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9742   match(Set dst (RShiftVS dst shift));
9743   effect(TEMP src);
9744   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9745   ins_encode %{
9746     int vector_len = 0;
9747     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9748   %}
9749   ins_pipe( pipe_slow );
9750 %}
9751 
9752 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9753   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9754   match(Set dst (RShiftVS src shift));
9755   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9756   ins_encode %{
9757     int vector_len = 0;
9758     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9759   %}
9760   ins_pipe( pipe_slow );
9761 %}
9762 
9763 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9764   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9765   match(Set dst (RShiftVS src shift));
9766   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9767   ins_encode %{
9768     int vector_len = 0;
9769     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9770   %}
9771   ins_pipe( pipe_slow );
9772 %}
9773 
9774 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9775   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9776   match(Set dst (RShiftVS dst shift));
9777   effect(TEMP src);
9778   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9779   ins_encode %{
9780     int vector_len = 0;
9781     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9782   %}
9783   ins_pipe( pipe_slow );
9784 %}
9785 
9786 instruct vsra8S(vecX dst, vecS shift) %{
9787   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9788   match(Set dst (RShiftVS dst shift));
9789   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9790   ins_encode %{
9791     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9792   %}
9793   ins_pipe( pipe_slow );
9794 %}
9795 
9796 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9797   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9798   match(Set dst (RShiftVS dst shift));
9799   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9800   ins_encode %{
9801     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9802   %}
9803   ins_pipe( pipe_slow );
9804 %}
9805 
9806 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9807   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9808   match(Set dst (RShiftVS src shift));
9809   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9810   ins_encode %{
9811     int vector_len = 0;
9812     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9813   %}
9814   ins_pipe( pipe_slow );
9815 %}
9816 
9817 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9818   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9819   match(Set dst (RShiftVS src shift));
9820   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9821   ins_encode %{
9822     int vector_len = 0;
9823     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9824   %}
9825   ins_pipe( pipe_slow );
9826 %}
9827 
9828 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9829   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9830   match(Set dst (RShiftVS dst shift));
9831   effect(TEMP src);
9832   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9833   ins_encode %{
9834     int vector_len = 0;
9835     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9836   %}
9837   ins_pipe( pipe_slow );
9838 %}
9839 
9840 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9841   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9842   match(Set dst (RShiftVS src shift));
9843   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9844   ins_encode %{
9845     int vector_len = 0;
9846     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9847   %}
9848   ins_pipe( pipe_slow );
9849 %}
9850 
9851 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9852   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9853   match(Set dst (RShiftVS src shift));
9854   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9855   ins_encode %{
9856     int vector_len = 0;
9857     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9858   %}
9859   ins_pipe( pipe_slow );
9860 %}
9861 
9862 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9863   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9864   match(Set dst (RShiftVS dst shift));
9865   effect(TEMP src);
9866   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9867   ins_encode %{
9868     int vector_len = 0;
9869     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9870   %}
9871   ins_pipe( pipe_slow );
9872 %}
9873 
9874 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9875   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9876   match(Set dst (RShiftVS src shift));
9877   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9878   ins_encode %{
9879     int vector_len = 1;
9880     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9881   %}
9882   ins_pipe( pipe_slow );
9883 %}
9884 
9885 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9886   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9887   match(Set dst (RShiftVS src shift));
9888   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9889   ins_encode %{
9890     int vector_len = 1;
9891     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9892   %}
9893   ins_pipe( pipe_slow );
9894 %}
9895 
9896 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9897   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9898   match(Set dst (RShiftVS dst shift));
9899   effect(TEMP src);
9900   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9901   ins_encode %{
9902     int vector_len = 1;
9903     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9904   %}
9905   ins_pipe( pipe_slow );
9906 %}
9907 
9908 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9909   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9910   match(Set dst (RShiftVS src shift));
9911   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9912   ins_encode %{
9913     int vector_len = 1;
9914     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9915   %}
9916   ins_pipe( pipe_slow );
9917 %}
9918 
9919 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9920   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9921   match(Set dst (RShiftVS src shift));
9922   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9923   ins_encode %{
9924     int vector_len = 1;
9925     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9926   %}
9927   ins_pipe( pipe_slow );
9928 %}
9929 
9930 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9931   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9932   match(Set dst (RShiftVS dst shift));
9933   effect(TEMP src);
9934   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9935   ins_encode %{
9936     int vector_len = 1;
9937     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9938   %}
9939   ins_pipe( pipe_slow );
9940 %}
9941 
9942 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
9943   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9944   match(Set dst (RShiftVS src shift));
9945   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9946   ins_encode %{
9947     int vector_len = 2;
9948     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9949   %}
9950   ins_pipe( pipe_slow );
9951 %}
9952 
9953 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9954   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9955   match(Set dst (RShiftVS src shift));
9956   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9957   ins_encode %{
9958     int vector_len = 2;
9959     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9960   %}
9961   ins_pipe( pipe_slow );
9962 %}
9963 
9964 // Integers vector arithmetic right shift
9965 instruct vsra2I(vecD dst, vecS shift) %{
9966   predicate(n->as_Vector()->length() == 2);
9967   match(Set dst (RShiftVI dst shift));
9968   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9969   ins_encode %{
9970     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9971   %}
9972   ins_pipe( pipe_slow );
9973 %}
9974 
9975 instruct vsra2I_imm(vecD dst, immI8 shift) %{
9976   predicate(n->as_Vector()->length() == 2);
9977   match(Set dst (RShiftVI dst shift));
9978   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9979   ins_encode %{
9980     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9981   %}
9982   ins_pipe( pipe_slow );
9983 %}
9984 
9985 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
9986   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9987   match(Set dst (RShiftVI src shift));
9988   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9989   ins_encode %{
9990     int vector_len = 0;
9991     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9992   %}
9993   ins_pipe( pipe_slow );
9994 %}
9995 
9996 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9997   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9998   match(Set dst (RShiftVI src shift));
9999   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10000   ins_encode %{
10001     int vector_len = 0;
10002     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10003   %}
10004   ins_pipe( pipe_slow );
10005 %}
10006 
10007 instruct vsra4I(vecX dst, vecS shift) %{
10008   predicate(n->as_Vector()->length() == 4);
10009   match(Set dst (RShiftVI dst shift));
10010   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10011   ins_encode %{
10012     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10013   %}
10014   ins_pipe( pipe_slow );
10015 %}
10016 
10017 instruct vsra4I_imm(vecX dst, immI8 shift) %{
10018   predicate(n->as_Vector()->length() == 4);
10019   match(Set dst (RShiftVI dst shift));
10020   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10021   ins_encode %{
10022     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10023   %}
10024   ins_pipe( pipe_slow );
10025 %}
10026 
10027 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
10028   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10029   match(Set dst (RShiftVI src shift));
10030   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10031   ins_encode %{
10032     int vector_len = 0;
10033     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10034   %}
10035   ins_pipe( pipe_slow );
10036 %}
10037 
10038 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10039   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10040   match(Set dst (RShiftVI src shift));
10041   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10042   ins_encode %{
10043     int vector_len = 0;
10044     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10045   %}
10046   ins_pipe( pipe_slow );
10047 %}
10048 
10049 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
10050   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10051   match(Set dst (RShiftVI src shift));
10052   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10053   ins_encode %{
10054     int vector_len = 1;
10055     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10056   %}
10057   ins_pipe( pipe_slow );
10058 %}
10059 
10060 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10061   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10062   match(Set dst (RShiftVI src shift));
10063   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10064   ins_encode %{
10065     int vector_len = 1;
10066     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10067   %}
10068   ins_pipe( pipe_slow );
10069 %}
10070 
10071 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
10072   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10073   match(Set dst (RShiftVI src shift));
10074   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10075   ins_encode %{
10076     int vector_len = 2;
10077     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10078   %}
10079   ins_pipe( pipe_slow );
10080 %}
10081 
10082 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10083   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10084   match(Set dst (RShiftVI src shift));
10085   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10086   ins_encode %{
10087     int vector_len = 2;
10088     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10089   %}
10090   ins_pipe( pipe_slow );
10091 %}
10092 
10093 // There are no longs vector arithmetic right shift instructions.
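// (Neither SSE nor AVX/AVX2 provides a packed 64-bit arithmetic right shift;
// VPSRAQ only appears with AVX-512, so no RShiftVL rules are defined here.)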
10094 
10095 
10096 // --------------------------------- AND --------------------------------------
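// The bitwise AND/OR/XOR rules in the sections below key on
// length_in_bytes() rather than on the element count: the element type is
// irrelevant to a bitwise operation, and pand/por/pxor (and their vp*
// AVX/EVEX forms) simply operate on the whole 128/256/512-bit register.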
10097 
10098 instruct vand4B(vecS dst, vecS src) %{
10099   predicate(n->as_Vector()->length_in_bytes() == 4);
10100   match(Set dst (AndV dst src));
10101   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
10102   ins_encode %{
10103     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10104   %}
10105   ins_pipe( pipe_slow );
10106 %}
10107 
10108 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
10109   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10110   match(Set dst (AndV src1 src2));
10111   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
10112   ins_encode %{
10113     int vector_len = 0;
10114     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10115   %}
10116   ins_pipe( pipe_slow );
10117 %}
10118 
10119 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
10120   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10121   match(Set dst (AndV src (LoadVector mem)));
10122   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
10123   ins_encode %{
10124     int vector_len = 0;
10125     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10126   %}
10127   ins_pipe( pipe_slow );
10128 %}
10129 
10130 instruct vand8B(vecD dst, vecD src) %{
10131   predicate(n->as_Vector()->length_in_bytes() == 8);
10132   match(Set dst (AndV dst src));
10133   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
10134   ins_encode %{
10135     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10136   %}
10137   ins_pipe( pipe_slow );
10138 %}
10139 
10140 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
10141   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10142   match(Set dst (AndV src1 src2));
10143   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
10144   ins_encode %{
10145     int vector_len = 0;
10146     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10147   %}
10148   ins_pipe( pipe_slow );
10149 %}
10150 
10151 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
10152   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10153   match(Set dst (AndV src (LoadVector mem)));
10154   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
10155   ins_encode %{
10156     int vector_len = 0;
10157     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10158   %}
10159   ins_pipe( pipe_slow );
10160 %}
10161 
10162 instruct vand16B(vecX dst, vecX src) %{
10163   predicate(n->as_Vector()->length_in_bytes() == 16);
10164   match(Set dst (AndV dst src));
10165   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
10166   ins_encode %{
10167     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10168   %}
10169   ins_pipe( pipe_slow );
10170 %}
10171 
10172 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
10173   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10174   match(Set dst (AndV src1 src2));
10175   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
10176   ins_encode %{
10177     int vector_len = 0;
10178     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10179   %}
10180   ins_pipe( pipe_slow );
10181 %}
10182 
10183 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
10184   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10185   match(Set dst (AndV src (LoadVector mem)));
10186   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
10187   ins_encode %{
10188     int vector_len = 0;
10189     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10190   %}
10191   ins_pipe( pipe_slow );
10192 %}
10193 
10194 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
10195   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10196   match(Set dst (AndV src1 src2));
10197   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
10198   ins_encode %{
10199     int vector_len = 1;
10200     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10201   %}
10202   ins_pipe( pipe_slow );
10203 %}
10204 
10205 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
10206   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10207   match(Set dst (AndV src (LoadVector mem)));
10208   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
10209   ins_encode %{
10210     int vector_len = 1;
10211     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10212   %}
10213   ins_pipe( pipe_slow );
10214 %}
10215 
10216 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10217   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10218   match(Set dst (AndV src1 src2));
10219   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
10220   ins_encode %{
10221     int vector_len = 2;
10222     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10223   %}
10224   ins_pipe( pipe_slow );
10225 %}
10226 
10227 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
10228   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10229   match(Set dst (AndV src (LoadVector mem)));
10230   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
10231   ins_encode %{
10232     int vector_len = 2;
10233     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10234   %}
10235   ins_pipe( pipe_slow );
10236 %}
10237 
10238 // --------------------------------- OR ---------------------------------------
10239 
10240 instruct vor4B(vecS dst, vecS src) %{
10241   predicate(n->as_Vector()->length_in_bytes() == 4);
10242   match(Set dst (OrV dst src));
10243   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
10244   ins_encode %{
10245     __ por($dst$$XMMRegister, $src$$XMMRegister);
10246   %}
10247   ins_pipe( pipe_slow );
10248 %}
10249 
10250 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
10251   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10252   match(Set dst (OrV src1 src2));
10253   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
10254   ins_encode %{
10255     int vector_len = 0;
10256     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10257   %}
10258   ins_pipe( pipe_slow );
10259 %}
10260 
10261 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
10262   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10263   match(Set dst (OrV src (LoadVector mem)));
10264   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
10265   ins_encode %{
10266     int vector_len = 0;
10267     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10268   %}
10269   ins_pipe( pipe_slow );
10270 %}
10271 
10272 instruct vor8B(vecD dst, vecD src) %{
10273   predicate(n->as_Vector()->length_in_bytes() == 8);
10274   match(Set dst (OrV dst src));
10275   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
10276   ins_encode %{
10277     __ por($dst$$XMMRegister, $src$$XMMRegister);
10278   %}
10279   ins_pipe( pipe_slow );
10280 %}
10281 
10282 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
10283   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10284   match(Set dst (OrV src1 src2));
10285   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
10286   ins_encode %{
10287     int vector_len = 0;
10288     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10289   %}
10290   ins_pipe( pipe_slow );
10291 %}
10292 
10293 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
10294   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10295   match(Set dst (OrV src (LoadVector mem)));
10296   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
10297   ins_encode %{
10298     int vector_len = 0;
10299     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10300   %}
10301   ins_pipe( pipe_slow );
10302 %}
10303 
10304 instruct vor16B(vecX dst, vecX src) %{
10305   predicate(n->as_Vector()->length_in_bytes() == 16);
10306   match(Set dst (OrV dst src));
10307   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
10308   ins_encode %{
10309     __ por($dst$$XMMRegister, $src$$XMMRegister);
10310   %}
10311   ins_pipe( pipe_slow );
10312 %}
10313 
10314 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
10315   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10316   match(Set dst (OrV src1 src2));
10317   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
10318   ins_encode %{
10319     int vector_len = 0;
10320     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10321   %}
10322   ins_pipe( pipe_slow );
10323 %}
10324 
10325 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
10326   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10327   match(Set dst (OrV src (LoadVector mem)));
10328   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
10329   ins_encode %{
10330     int vector_len = 0;
10331     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10332   %}
10333   ins_pipe( pipe_slow );
10334 %}
10335 
10336 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
10337   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10338   match(Set dst (OrV src1 src2));
10339   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
10340   ins_encode %{
10341     int vector_len = 1;
10342     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10343   %}
10344   ins_pipe( pipe_slow );
10345 %}
10346 
10347 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
10348   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10349   match(Set dst (OrV src (LoadVector mem)));
10350   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
10351   ins_encode %{
10352     int vector_len = 1;
10353     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10354   %}
10355   ins_pipe( pipe_slow );
10356 %}
10357 
10358 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10359   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10360   match(Set dst (OrV src1 src2));
10361   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
10362   ins_encode %{
10363     int vector_len = 2;
10364     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10365   %}
10366   ins_pipe( pipe_slow );
10367 %}
10368 
10369 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
10370   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10371   match(Set dst (OrV src (LoadVector mem)));
10372   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
10373   ins_encode %{
10374     int vector_len = 2;
10375     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10376   %}
10377   ins_pipe( pipe_slow );
10378 %}
10379 
10380 // --------------------------------- XOR --------------------------------------
10381 
10382 instruct vxor4B(vecS dst, vecS src) %{
10383   predicate(n->as_Vector()->length_in_bytes() == 4);
10384   match(Set dst (XorV dst src));
10385   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
10386   ins_encode %{
10387     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10388   %}
10389   ins_pipe( pipe_slow );
10390 %}
10391 
10392 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
10393   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10394   match(Set dst (XorV src1 src2));
10395   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
10396   ins_encode %{
10397     int vector_len = 0;
10398     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10399   %}
10400   ins_pipe( pipe_slow );
10401 %}
10402 
10403 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
10404   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10405   match(Set dst (XorV src (LoadVector mem)));
10406   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
10407   ins_encode %{
10408     int vector_len = 0;
10409     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10410   %}
10411   ins_pipe( pipe_slow );
10412 %}
10413 
10414 instruct vxor8B(vecD dst, vecD src) %{
10415   predicate(n->as_Vector()->length_in_bytes() == 8);
10416   match(Set dst (XorV dst src));
10417   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
10418   ins_encode %{
10419     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10420   %}
10421   ins_pipe( pipe_slow );
10422 %}
10423 
10424 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
10425   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10426   match(Set dst (XorV src1 src2));
10427   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
10428   ins_encode %{
10429     int vector_len = 0;
10430     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10431   %}
10432   ins_pipe( pipe_slow );
10433 %}
10434 
10435 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
10436   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10437   match(Set dst (XorV src (LoadVector mem)));
10438   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
10439   ins_encode %{
10440     int vector_len = 0;
10441     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10442   %}
10443   ins_pipe( pipe_slow );
10444 %}
10445 
10446 instruct vxor16B(vecX dst, vecX src) %{
10447   predicate(n->as_Vector()->length_in_bytes() == 16);
10448   match(Set dst (XorV dst src));
10449   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
10450   ins_encode %{
10451     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10452   %}
10453   ins_pipe( pipe_slow );
10454 %}
10455 
10456 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
10457   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10458   match(Set dst (XorV src1 src2));
10459   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
10460   ins_encode %{
10461     int vector_len = 0;
10462     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10463   %}
10464   ins_pipe( pipe_slow );
10465 %}
10466 
10467 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
10468   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10469   match(Set dst (XorV src (LoadVector mem)));
10470   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
10471   ins_encode %{
10472     int vector_len = 0;
10473     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10474   %}
10475   ins_pipe( pipe_slow );
10476 %}
10477 
10478 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
10479   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10480   match(Set dst (XorV src1 src2));
10481   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
10482   ins_encode %{
10483     int vector_len = 1;
10484     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10485   %}
10486   ins_pipe( pipe_slow );
10487 %}
10488 
10489 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
10490   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10491   match(Set dst (XorV src (LoadVector mem)));
10492   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
10493   ins_encode %{
10494     int vector_len = 1;
10495     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10496   %}
10497   ins_pipe( pipe_slow );
10498 %}
10499 
10500 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10501   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10502   match(Set dst (XorV src1 src2));
10503   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
10504   ins_encode %{
10505     int vector_len = 2;
10506     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10507   %}
10508   ins_pipe( pipe_slow );
10509 %}
10510 
10511 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
10512   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10513   match(Set dst (XorV src (LoadVector mem)));
10514   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
10515   ins_encode %{
10516     int vector_len = 2;
10517     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10518   %}
10519   ins_pipe( pipe_slow );
10520 %}
10521 
10522 // --------------------------------- FMA --------------------------------------
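// The rules below match the FmaVF/FmaVD ideal nodes (typically produced by
// superword vectorization of Math.fma calls, and guarded by UseFMA) and
// compute c = a * b + c in one fused step.  Note that $c is both the
// accumulator input and the result, so the match rule sets $c instead of a
// separate dst; the vfmaf/vfmad macro assembler calls presumably expand to
// the VFMADD231PS/VFMADD231PD forms, with the exact encoding left to the
// macro assembler.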
10523 
10524 // a * b + c
10525 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
10526   predicate(UseFMA && n->as_Vector()->length() == 2);
10527   match(Set c (FmaVD  c (Binary a b)));
10528   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
10529   ins_cost(150);
10530   ins_encode %{
10531     int vector_len = 0;
10532     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10533   %}
10534   ins_pipe( pipe_slow );
10535 %}
10536 
10537 // a * b + c
10538 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
10539   predicate(UseFMA && n->as_Vector()->length() == 2);
10540   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10541   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
10542   ins_cost(150);
10543   ins_encode %{
10544     int vector_len = 0;
10545     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10546   %}
10547   ins_pipe( pipe_slow );
10548 %}
10549 
10550 
10551 // a * b + c
10552 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
10553   predicate(UseFMA && n->as_Vector()->length() == 4);
10554   match(Set c (FmaVD  c (Binary a b)));
10555   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
10556   ins_cost(150);
10557   ins_encode %{
10558     int vector_len = 1;
10559     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10560   %}
10561   ins_pipe( pipe_slow );
10562 %}
10563 
10564 // a * b + c
10565 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
10566   predicate(UseFMA && n->as_Vector()->length() == 4);
10567   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10568   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
10569   ins_cost(150);
10570   ins_encode %{
10571     int vector_len = 1;
10572     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10573   %}
10574   ins_pipe( pipe_slow );
10575 %}
10576 
10577 // a * b + c
10578 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
10579   predicate(UseFMA && n->as_Vector()->length() == 8);
10580   match(Set c (FmaVD  c (Binary a b)));
10581   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
10582   ins_cost(150);
10583   ins_encode %{
10584     int vector_len = 2;
10585     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10586   %}
10587   ins_pipe( pipe_slow );
10588 %}
10589 
10590 // a * b + c
10591 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
10592   predicate(UseFMA && n->as_Vector()->length() == 8);
10593   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10594   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
10595   ins_cost(150);
10596   ins_encode %{
10597     int vector_len = 2;
10598     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10599   %}
10600   ins_pipe( pipe_slow );
10601 %}
10602 
10603 // a * b + c
10604 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
10605   predicate(UseFMA && n->as_Vector()->length() == 4);
10606   match(Set c (FmaVF  c (Binary a b)));
10607   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
10608   ins_cost(150);
10609   ins_encode %{
10610     int vector_len = 0;
10611     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10612   %}
10613   ins_pipe( pipe_slow );
10614 %}
10615 
10616 // a * b + c
10617 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
10618   predicate(UseFMA && n->as_Vector()->length() == 4);
10619   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10620   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
10621   ins_cost(150);
10622   ins_encode %{
10623     int vector_len = 0;
10624     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10625   %}
10626   ins_pipe( pipe_slow );
10627 %}
10628 
10629 // a * b + c
10630 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
10631   predicate(UseFMA && n->as_Vector()->length() == 8);
10632   match(Set c (FmaVF  c (Binary a b)));
10633   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
10634   ins_cost(150);
10635   ins_encode %{
10636     int vector_len = 1;
10637     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10638   %}
10639   ins_pipe( pipe_slow );
10640 %}
10641 
10642 // a * b + c
10643 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
10644   predicate(UseFMA && n->as_Vector()->length() == 8);
10645   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10646   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
10647   ins_cost(150);
10648   ins_encode %{
10649     int vector_len = 1;
10650     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10651   %}
10652   ins_pipe( pipe_slow );
10653 %}
10654 
10655 // a * b + c
10656 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
10657   predicate(UseFMA && n->as_Vector()->length() == 16);
10658   match(Set c (FmaVF  c (Binary a b)));
10659   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
10660   ins_cost(150);
10661   ins_encode %{
10662     int vector_len = 2;
10663     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10664   %}
10665   ins_pipe( pipe_slow );
10666 %}
10667 
10668 // a * b + c
10669 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
10670   predicate(UseFMA && n->as_Vector()->length() == 16);
10671   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10672   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
10673   ins_cost(150);
10674   ins_encode %{
10675     int vector_len = 2;
10676     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10677   %}
10678   ins_pipe( pipe_slow );
10679 %}