1 //
   2 // Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding, vm register );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
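//
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares word (a) of xmm0 as a Save-On-Call float register with
// encoding 0, backed by the VMReg returned by xmm0->as_VMReg().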
  61 
  62 // XMM registers.  512-bit registers, 16 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre-EVEX architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX when UseAVX is enabled).
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 must use extended encodings (REX/VEX for XMM8-XMM15, EVEX for XMM16-XMM31).
  71 //
  72 // Linux ABI:   No XMM register is preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM15 (lower 128 bits) preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
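//
// The word labels map directly onto the vector register classes defined
// below: a 32-bit vector uses word (a) only, a 64-bit vector words (a)-(b),
// a 128-bit vector words (a)-(d), a 256-bit vector words (a)-(h), and a
// 512-bit vector all sixteen words (a)-(p).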
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre-EVEX float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for EVEX float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
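// reg_class_dynamic selects between the two classes above using the given
// predicate: float_reg is intended to resolve to float_reg_evex when
// VM_Version::supports_evex() is true and to float_reg_legacy otherwise.
// The double and vector classes below follow the same pattern.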
 732 
 733 // Class for pre-EVEX double registers
 734 reg_class double_reg_legacy(XMM0,  XMM0b,
 735                      XMM1,  XMM1b,
 736                      XMM2,  XMM2b,
 737                      XMM3,  XMM3b,
 738                      XMM4,  XMM4b,
 739                      XMM5,  XMM5b,
 740                      XMM6,  XMM6b,
 741                      XMM7,  XMM7b
 742 #ifdef _LP64
 743                     ,XMM8,  XMM8b,
 744                      XMM9,  XMM9b,
 745                      XMM10, XMM10b,
 746                      XMM11, XMM11b,
 747                      XMM12, XMM12b,
 748                      XMM13, XMM13b,
 749                      XMM14, XMM14b,
 750                      XMM15, XMM15b
 751 #endif
 752                      );
 753 
 754 // Class for EVEX double registers
 755 reg_class double_reg_evex(XMM0,  XMM0b,
 756                      XMM1,  XMM1b,
 757                      XMM2,  XMM2b,
 758                      XMM3,  XMM3b,
 759                      XMM4,  XMM4b,
 760                      XMM5,  XMM5b,
 761                      XMM6,  XMM6b,
 762                      XMM7,  XMM7b
 763 #ifdef _LP64
 764                     ,XMM8,  XMM8b,
 765                      XMM9,  XMM9b,
 766                      XMM10, XMM10b,
 767                      XMM11, XMM11b,
 768                      XMM12, XMM12b,
 769                      XMM13, XMM13b,
 770                      XMM14, XMM14b,
 771                      XMM15, XMM15b,
 772                      XMM16, XMM16b,
 773                      XMM17, XMM17b,
 774                      XMM18, XMM18b,
 775                      XMM19, XMM19b,
 776                      XMM20, XMM20b,
 777                      XMM21, XMM21b,
 778                      XMM22, XMM22b,
 779                      XMM23, XMM23b,
 780                      XMM24, XMM24b,
 781                      XMM25, XMM25b,
 782                      XMM26, XMM26b,
 783                      XMM27, XMM27b,
 784                      XMM28, XMM28b,
 785                      XMM29, XMM29b,
 786                      XMM30, XMM30b,
 787                      XMM31, XMM31b
 788 #endif
 789                      );
 790 
 791 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 792 
 793 // Class for pre-EVEX 32bit vector registers
 794 reg_class vectors_reg_legacy(XMM0,
 795                       XMM1,
 796                       XMM2,
 797                       XMM3,
 798                       XMM4,
 799                       XMM5,
 800                       XMM6,
 801                       XMM7
 802 #ifdef _LP64
 803                      ,XMM8,
 804                       XMM9,
 805                       XMM10,
 806                       XMM11,
 807                       XMM12,
 808                       XMM13,
 809                       XMM14,
 810                       XMM15
 811 #endif
 812                       );
 813 
 814 // Class for EVEX 32bit vector registers
 815 reg_class vectors_reg_evex(XMM0,
 816                       XMM1,
 817                       XMM2,
 818                       XMM3,
 819                       XMM4,
 820                       XMM5,
 821                       XMM6,
 822                       XMM7
 823 #ifdef _LP64
 824                      ,XMM8,
 825                       XMM9,
 826                       XMM10,
 827                       XMM11,
 828                       XMM12,
 829                       XMM13,
 830                       XMM14,
 831                       XMM15,
 832                       XMM16,
 833                       XMM17,
 834                       XMM18,
 835                       XMM19,
 836                       XMM20,
 837                       XMM21,
 838                       XMM22,
 839                       XMM23,
 840                       XMM24,
 841                       XMM25,
 842                       XMM26,
 843                       XMM27,
 844                       XMM28,
 845                       XMM29,
 846                       XMM30,
 847                       XMM31
 848 #endif
 849                       );
 850 
 851 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 852 
 853 // Class for pre-EVEX 64bit vector registers
 854 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 855                       XMM1,  XMM1b,
 856                       XMM2,  XMM2b,
 857                       XMM3,  XMM3b,
 858                       XMM4,  XMM4b,
 859                       XMM5,  XMM5b,
 860                       XMM6,  XMM6b,
 861                       XMM7,  XMM7b
 862 #ifdef _LP64
 863                      ,XMM8,  XMM8b,
 864                       XMM9,  XMM9b,
 865                       XMM10, XMM10b,
 866                       XMM11, XMM11b,
 867                       XMM12, XMM12b,
 868                       XMM13, XMM13b,
 869                       XMM14, XMM14b,
 870                       XMM15, XMM15b
 871 #endif
 872                       );
 873 
 874 // Class for EVEX 64bit vector registers
 875 reg_class vectord_reg_evex(XMM0,  XMM0b,
 876                       XMM1,  XMM1b,
 877                       XMM2,  XMM2b,
 878                       XMM3,  XMM3b,
 879                       XMM4,  XMM4b,
 880                       XMM5,  XMM5b,
 881                       XMM6,  XMM6b,
 882                       XMM7,  XMM7b
 883 #ifdef _LP64
 884                      ,XMM8,  XMM8b,
 885                       XMM9,  XMM9b,
 886                       XMM10, XMM10b,
 887                       XMM11, XMM11b,
 888                       XMM12, XMM12b,
 889                       XMM13, XMM13b,
 890                       XMM14, XMM14b,
 891                       XMM15, XMM15b,
 892                       XMM16, XMM16b,
 893                       XMM17, XMM17b,
 894                       XMM18, XMM18b,
 895                       XMM19, XMM19b,
 896                       XMM20, XMM20b,
 897                       XMM21, XMM21b,
 898                       XMM22, XMM22b,
 899                       XMM23, XMM23b,
 900                       XMM24, XMM24b,
 901                       XMM25, XMM25b,
 902                       XMM26, XMM26b,
 903                       XMM27, XMM27b,
 904                       XMM28, XMM28b,
 905                       XMM29, XMM29b,
 906                       XMM30, XMM30b,
 907                       XMM31, XMM31b
 908 #endif
 909                       );
 910 
 911 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 912 
 913 // Class for pre-EVEX 128bit vector registers
 914 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 915                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 916                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 917                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 918                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 919                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 920                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 921                       XMM7,  XMM7b,  XMM7c,  XMM7d
 922 #ifdef _LP64
 923                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 924                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 925                       XMM10, XMM10b, XMM10c, XMM10d,
 926                       XMM11, XMM11b, XMM11c, XMM11d,
 927                       XMM12, XMM12b, XMM12c, XMM12d,
 928                       XMM13, XMM13b, XMM13c, XMM13d,
 929                       XMM14, XMM14b, XMM14c, XMM14d,
 930                       XMM15, XMM15b, XMM15c, XMM15d
 931 #endif
 932                       );
 933 
 934 // Class for EVEX 128bit vector registers
 935 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 936                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 937                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 938                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 939                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 940                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 941                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 942                       XMM7,  XMM7b,  XMM7c,  XMM7d
 943 #ifdef _LP64
 944                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 945                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 946                       XMM10, XMM10b, XMM10c, XMM10d,
 947                       XMM11, XMM11b, XMM11c, XMM11d,
 948                       XMM12, XMM12b, XMM12c, XMM12d,
 949                       XMM13, XMM13b, XMM13c, XMM13d,
 950                       XMM14, XMM14b, XMM14c, XMM14d,
 951                       XMM15, XMM15b, XMM15c, XMM15d,
 952                       XMM16, XMM16b, XMM16c, XMM16d,
 953                       XMM17, XMM17b, XMM17c, XMM17d,
 954                       XMM18, XMM18b, XMM18c, XMM18d,
 955                       XMM19, XMM19b, XMM19c, XMM19d,
 956                       XMM20, XMM20b, XMM20c, XMM20d,
 957                       XMM21, XMM21b, XMM21c, XMM21d,
 958                       XMM22, XMM22b, XMM22c, XMM22d,
 959                       XMM23, XMM23b, XMM23c, XMM23d,
 960                       XMM24, XMM24b, XMM24c, XMM24d,
 961                       XMM25, XMM25b, XMM25c, XMM25d,
 962                       XMM26, XMM26b, XMM26c, XMM26d,
 963                       XMM27, XMM27b, XMM27c, XMM27d,
 964                       XMM28, XMM28b, XMM28c, XMM28d,
 965                       XMM29, XMM29b, XMM29c, XMM29d,
 966                       XMM30, XMM30b, XMM30c, XMM30d,
 967                       XMM31, XMM31b, XMM31c, XMM31d
 968 #endif
 969                       );
 970 
 971 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 972 
 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
1069 
1070 %}
1071 
1072 
1073 //----------SOURCE BLOCK-------------------------------------------------------
1074 // This is a block of C++ code which provides values, functions, and
1075 // definitions necessary in the rest of the architecture description
1076 
1077 source_hpp %{
1078 // Header information of the source block.
1079 // Method declarations/definitions which are used outside
1080 // the ad-scope can conveniently be defined here.
1081 //
1082 // To keep related declarations/definitions/uses close together,
1083 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1084 
1085 class NativeJump;
1086 
1087 class CallStubImpl {
1088 
1089   //--------------------------------------------------------------
1090   //---<  Used for optimization in Compile::shorten_branches  >---
1091   //--------------------------------------------------------------
1092 
1093  public:
1094   // Size of call trampoline stub.
1095   static uint size_call_trampoline() {
1096     return 0; // no call trampolines on this platform
1097   }
1098 
1099   // number of relocations needed by a call trampoline stub
1100   static uint reloc_call_trampoline() {
1101     return 0; // no call trampolines on this platform
1102   }
1103 };
1104 
1105 class HandlerImpl {
1106 
1107  public:
1108 
1109   static int emit_exception_handler(CodeBuffer &cbuf);
1110   static int emit_deopt_handler(CodeBuffer& cbuf);
1111 
1112   static uint size_exception_handler() {
1113     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1116     // Note that this value is also credited (in output.cpp) to
1117     // the size of the code section.
1118     return NativeJump::instruction_size;
1119   }
1120 
1121 #ifdef _LP64
1122   static uint size_deopt_handler() {
1123     // three 5 byte instructions
1124     return 15;
1125   }
1126 #else
1127   static uint size_deopt_handler() {
1128     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1131     // Note that this value is also credited (in output.cpp) to
1132     // the size of the code section.
1133     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1134   }
1135 #endif
1136 };
1137 
1138 %} // end source_hpp
1139 
1140 source %{
1141 
1142 #include "opto/addnode.hpp"
1143 
1144 // Emit exception handler code.
1145 // Stuff framesize into a register and call a VM stub routine.
1146 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1147 
1148   // Note that the code buffer's insts_mark is always relative to insts.
1149   // That's why we must use the macroassembler to generate a handler.
1150   MacroAssembler _masm(&cbuf);
1151   address base = __ start_a_stub(size_exception_handler());
1152   if (base == NULL) {
1153     ciEnv::current()->record_failure("CodeCache is full");
1154     return 0;  // CodeBuffer::expand failed
1155   }
1156   int offset = __ offset();
1157   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1158   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1159   __ end_a_stub();
1160   return offset;
1161 }
1162 
1163 // Emit deopt handler code.
1164 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1165 
1166   // Note that the code buffer's insts_mark is always relative to insts.
1167   // That's why we must use the macroassembler to generate a handler.
1168   MacroAssembler _masm(&cbuf);
1169   address base = __ start_a_stub(size_deopt_handler());
1170   if (base == NULL) {
1171     ciEnv::current()->record_failure("CodeCache is full");
1172     return 0;  // CodeBuffer::expand failed
1173   }
1174   int offset = __ offset();
1175 
1176 #ifdef _LP64
1177   address the_pc = (address) __ pc();
1178   Label next;
  // Push the value of "the_pc" on the stack without destroying any registers,
  // as they may all be live.
1181 
1182   // push address of "next"
1183   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1184   __ bind(next);
1185   // adjust it so it matches "the_pc"
1186   __ subptr(Address(rsp, 0), __ offset() - offset);
1187 #else
1188   InternalAddress here(__ pc());
1189   __ pushptr(here.addr());
1190 #endif
1191 
1192   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1193   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1194   __ end_a_stub();
1195   return offset;
1196 }
1197 
1198 
1199 //=============================================================================
1200 
1201   // Float masks come from different places depending on platform.
1202 #ifdef _LP64
1203   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1204   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1205   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1206   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1207 #else
1208   static address float_signmask()  { return (address)float_signmask_pool; }
1209   static address float_signflip()  { return (address)float_signflip_pool; }
1210   static address double_signmask() { return (address)double_signmask_pool; }
1211   static address double_signflip() { return (address)double_signflip_pool; }
1212 #endif
1213 
1214 
1215 const bool Matcher::match_rule_supported(int opcode) {
1216   if (!has_match_rule(opcode))
1217     return false;
1218 
1219   bool ret_value = true;
1220   switch (opcode) {
1221     case Op_PopCountI:
1222     case Op_PopCountL:
1223       if (!UsePopCountInstruction)
1224         ret_value = false;
1225       break;
1226     case Op_MulVI:
1227       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1228         ret_value = false;
1229       break;
1230     case Op_MulVL:
1231     case Op_MulReductionVL:
1232       if (VM_Version::supports_avx512dq() == false)
1233         ret_value = false;
1234       break;
1235     case Op_AddReductionVL:
1236       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1237         ret_value = false;
1238       break;
1239     case Op_AddReductionVI:
1240       if (UseSSE < 3) // requires at least SSE3
1241         ret_value = false;
1242       break;
1243     case Op_MulReductionVI:
1244       if (UseSSE < 4) // requires at least SSE4
1245         ret_value = false;
1246       break;
1247     case Op_AddReductionVF:
1248     case Op_AddReductionVD:
1249     case Op_MulReductionVF:
1250     case Op_MulReductionVD:
1251       if (UseSSE < 1) // requires at least SSE
1252         ret_value = false;
1253       break;
1254     case Op_SqrtVD:
1255       if (UseAVX < 1) // enabled for AVX only
1256         ret_value = false;
1257       break;
1258     case Op_CompareAndSwapL:
1259 #ifdef _LP64
1260     case Op_CompareAndSwapP:
1261 #endif
1262       if (!VM_Version::supports_cx8())
1263         ret_value = false;
1264       break;
1265     case Op_CMoveVD:
1266       if (UseAVX < 1 || UseAVX > 2)
1267         ret_value = false;
1268       break;
1269     case Op_StrIndexOf:
1270       if (!UseSSE42Intrinsics)
1271         ret_value = false;
1272       break;
1273     case Op_StrIndexOfChar:
1274       if (!UseSSE42Intrinsics)
1275         ret_value = false;
1276       break;
1277     case Op_OnSpinWait:
1278       if (VM_Version::supports_on_spin_wait() == false)
1279         ret_value = false;
1280       break;
1281   }
1282 
  return ret_value;  // By default, match rules are supported.
1284 }
1285 
1286 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1287   // identify extra cases that we might want to provide match rules for
1288   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
1289   bool ret_value = match_rule_supported(opcode);
1290   if (ret_value) {
1291     switch (opcode) {
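      // 512-bit byte vectors (vlen == 64) and 512-bit short vectors (vlen == 32)
      // need AVX512BW; without it, EVEX only provides 512-bit operations on
      // dword/qword elements.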
1292       case Op_AddVB:
1293       case Op_SubVB:
1294         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1295           ret_value = false;
1296         break;
1297       case Op_URShiftVS:
1298       case Op_RShiftVS:
1299       case Op_LShiftVS:
1300       case Op_MulVS:
1301       case Op_AddVS:
1302       case Op_SubVS:
1303         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1304           ret_value = false;
1305         break;
1306       case Op_CMoveVD:
1307         if (vlen != 4)
1308           ret_value  = false;
1309         break;
1310     }
1311   }
1312 
  return ret_value;  // By default, match rules are supported.
1314 }
1315 
1316 const bool Matcher::has_predicated_vectors(void) {
1317   bool ret_value = false;
1318   if (UseAVX > 2) {
1319     ret_value = VM_Version::supports_avx512vl();
1320   }
1321 
1322   return ret_value;
1323 }
1324 
1325 const int Matcher::float_pressure(int default_pressure_threshold) {
1326   int float_pressure_threshold = default_pressure_threshold;
1327 #ifdef _LP64
1328   if (UseAVX > 2) {
    // Increase the pressure threshold on machines with AVX-512, which
    // have twice as many XMM registers.
1331     float_pressure_threshold = default_pressure_threshold * 2;
1332   }
1333 #endif
1334   return float_pressure_threshold;
1335 }
1336 
1337 // Max vector size in bytes. 0 if not supported.
1338 const int Matcher::vector_width_in_bytes(BasicType bt) {
1339   assert(is_java_primitive(bt), "only primitive type vectors");
1340   if (UseSSE < 2) return 0;
1341   // SSE2 supports 128bit vectors for all types.
1342   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
1344   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1345   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1346   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1347     size = (UseAVX > 2) ? 64 : 32;
1348   // Use flag to limit vector size.
1349   size = MIN2(size,(int)MaxVectorSize);
1350   // Minimum 2 values in vector (or 4 for bytes).
1351   switch (bt) {
1352   case T_DOUBLE:
1353   case T_LONG:
1354     if (size < 16) return 0;
1355     break;
1356   case T_FLOAT:
1357   case T_INT:
1358     if (size < 8) return 0;
1359     break;
1360   case T_BOOLEAN:
1361     if (size < 4) return 0;
1362     break;
1363   case T_CHAR:
1364     if (size < 4) return 0;
1365     break;
1366   case T_BYTE:
1367     if (size < 4) return 0;
1368     break;
1369   case T_SHORT:
1370     if (size < 4) return 0;
1371     break;
1372   default:
1373     ShouldNotReachHere();
1374   }
1375   return size;
1376 }
1377 
1378 // Limits on vector size (number of elements) loaded into vector.
1379 const int Matcher::max_vector_size(const BasicType bt) {
1380   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1381 }
1382 const int Matcher::min_vector_size(const BasicType bt) {
1383   int max_size = max_vector_size(bt);
  // The minimum is 2 elements (4 for byte types), i.e. at least a 4-byte vector.
1385   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1386   return MIN2(size,max_size);
1387 }
1388 
// Vector ideal reg corresponding to specified size in bytes
1390 const int Matcher::vector_ideal_reg(int size) {
1391   assert(MaxVectorSize >= size, "");
1392   switch(size) {
1393     case  4: return Op_VecS;
1394     case  8: return Op_VecD;
1395     case 16: return Op_VecX;
1396     case 32: return Op_VecY;
1397     case 64: return Op_VecZ;
1398   }
1399   ShouldNotReachHere();
1400   return 0;
1401 }
1402 
1403 // Only lowest bits of xmm reg are used for vector shift count.
1404 const int Matcher::vector_shift_count_ideal_reg(int size) {
1405   return Op_VecS;
1406 }
1407 
1408 // x86 supports misaligned vectors store/load.
1409 const bool Matcher::misaligned_vectors_ok() {
1410   return !AlignVector; // can be changed by flag
1411 }
1412 
1413 // x86 AES instructions are compatible with SunJCE expanded
1414 // keys, hence we do not need to pass the original key to stubs
1415 const bool Matcher::pass_original_key_for_aes() {
1416   return false;
1417 }
1418 
1419 
1420 const bool Matcher::convi2l_type_required = true;
1421 
1422 // Check for shift by small constant as well
1423 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1424   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1425       shift->in(2)->get_int() <= 3 &&
1426       // Are there other uses besides address expressions?
1427       !matcher->is_visited(shift)) {
1428     address_visited.set(shift->_idx); // Flag as address_visited
1429     mstack.push(shift->in(2), Matcher::Visit);
1430     Node *conv = shift->in(1);
1431 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is known to be non-negative.
1435     if (conv->Opcode() == Op_ConvI2L &&
1436         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1437         // Are there other uses besides address expressions?
1438         !matcher->is_visited(conv)) {
1439       address_visited.set(conv->_idx); // Flag as address_visited
1440       mstack.push(conv->in(1), Matcher::Pre_Visit);
1441     } else
1442 #endif
1443       mstack.push(conv, Matcher::Pre_Visit);
1444     return true;
1445   }
1446   return false;
1447 }
1448 
1449 // Should the Matcher clone shifts on addressing modes, expecting them
1450 // to be subsumed into complex addressing expressions or compute them
1451 // into registers?
1452 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1453   Node *off = m->in(AddPNode::Offset);
1454   if (off->is_Con()) {
1455     address_visited.test_set(m->_idx); // Flag as address_visited
1456     Node *adr = m->in(AddPNode::Address);
1457 
    // Intel can handle 2 adds in an addressing mode.
    // AtomicAdd is not an addressing expression;
    // it is cheap to find by looking for a screwy base.
1461     if (adr->is_AddP() &&
1462         !adr->in(AddPNode::Base)->is_top() &&
1463         // Are there other uses besides address expressions?
1464         !is_visited(adr)) {
1465       address_visited.set(adr->_idx); // Flag as address_visited
1466       Node *shift = adr->in(AddPNode::Offset);
1467       if (!clone_shift(shift, this, mstack, address_visited)) {
1468         mstack.push(shift, Pre_Visit);
1469       }
1470       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1471       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1472     } else {
1473       mstack.push(adr, Pre_Visit);
1474     }
1475 
1476     // Clone X+offset as it also folds into most addressing expressions
1477     mstack.push(off, Visit);
1478     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1479     return true;
1480   } else if (clone_shift(off, this, mstack, address_visited)) {
1481     address_visited.test_set(m->_idx); // Flag as address_visited
1482     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1483     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1484     return true;
1485   }
1486   return false;
1487 }
1488 
1489 void Compile::reshape_address(AddPNode* addp) {
1490 }
1491 
1492 // Helper methods for MachSpillCopyNode::implementation().
1493 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1494                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1497   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1498   assert(ireg == Op_VecS || // 32bit vector
1499          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1500          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1501          "no non-adjacent vector moves" );
1502   if (cbuf) {
1503     MacroAssembler _masm(cbuf);
1504     int offset = __ offset();
1505     switch (ireg) {
1506     case Op_VecS: // copy whole register
1507     case Op_VecD:
1508     case Op_VecX:
1509       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1510       break;
1511     case Op_VecY:
1512       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1513       break;
1514     case Op_VecZ:
1515       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1516       break;
1517     default:
1518       ShouldNotReachHere();
1519     }
1520     int size = __ offset() - offset;
1521 #ifdef ASSERT
1522     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1524 #endif
1525     return size;
1526 #ifndef PRODUCT
1527   } else if (!do_size) {
1528     switch (ireg) {
1529     case Op_VecS:
1530     case Op_VecD:
1531     case Op_VecX:
1532       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1533       break;
1534     case Op_VecY:
1535     case Op_VecZ:
1536       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1537       break;
1538     default:
1539       ShouldNotReachHere();
1540     }
1541 #endif
1542   }
1543   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1544   return (UseAVX > 2) ? 6 : 4;
1545 }
1546 
1547 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1548                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1551   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1552   if (cbuf) {
1553     MacroAssembler _masm(cbuf);
1554     int offset = __ offset();
1555     if (is_load) {
1556       switch (ireg) {
1557       case Op_VecS:
1558         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1559         break;
1560       case Op_VecD:
1561         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1562         break;
1563       case Op_VecX:
1564         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1565         break;
1566       case Op_VecY:
1567         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1568         break;
1569       case Op_VecZ:
1570         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1571         break;
1572       default:
1573         ShouldNotReachHere();
1574       }
1575     } else { // store
1576       switch (ireg) {
1577       case Op_VecS:
1578         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1579         break;
1580       case Op_VecD:
1581         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1582         break;
1583       case Op_VecX:
1584         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1585         break;
1586       case Op_VecY:
1587         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1588         break;
1589       case Op_VecZ:
1590         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1591         break;
1592       default:
1593         ShouldNotReachHere();
1594       }
1595     }
1596     int size = __ offset() - offset;
1597 #ifdef ASSERT
1598     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1599     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1601 #endif
1602     return size;
1603 #ifndef PRODUCT
1604   } else if (!do_size) {
1605     if (is_load) {
1606       switch (ireg) {
1607       case Op_VecS:
1608         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1609         break;
1610       case Op_VecD:
1611         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1612         break;
1613        case Op_VecX:
1614         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1615         break;
1616       case Op_VecY:
1617       case Op_VecZ:
1618         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1619         break;
1620       default:
1621         ShouldNotReachHere();
1622       }
1623     } else { // store
1624       switch (ireg) {
1625       case Op_VecS:
1626         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1627         break;
1628       case Op_VecD:
1629         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1630         break;
1631        case Op_VecX:
1632         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1633         break;
1634       case Op_VecY:
1635       case Op_VecZ:
1636         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1637         break;
1638       default:
1639         ShouldNotReachHere();
1640       }
1641     }
1642 #endif
1643   }
1644   bool is_single_byte = false;
1645   int vec_len = 0;
1646   if ((UseAVX > 2) && (stack_offset != 0)) {
1647     int tuple_type = Assembler::EVEX_FVM;
1648     int input_size = Assembler::EVEX_32bit;
1649     switch (ireg) {
1650     case Op_VecS:
1651       tuple_type = Assembler::EVEX_T1S;
1652       break;
1653     case Op_VecD:
1654       tuple_type = Assembler::EVEX_T1S;
1655       input_size = Assembler::EVEX_64bit;
1656       break;
1657     case Op_VecX:
1658       break;
1659     case Op_VecY:
1660       vec_len = 1;
1661       break;
1662     case Op_VecZ:
1663       vec_len = 2;
1664       break;
1665     }
1666     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1667   }
1668   int offset_size = 0;
1669   int size = 5;
1670   if (UseAVX > 2 ) {
1671     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1672       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1673       size += 2; // Need an additional two bytes for EVEX encoding
1674     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1675       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1676     } else {
1677       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1679     }
1680   } else {
1681     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1682   }
1683   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1684   return size+offset_size;
1685 }
1686 
1687 static inline jint replicate4_imm(int con, int width) {
1688   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1689   assert(width == 1 || width == 2, "only byte or short types here");
1690   int bit_width = width * 8;
1691   jint val = con;
1692   val &= (1 << bit_width) - 1;  // mask off sign bits
1693   while(bit_width < 32) {
1694     val |= (val << bit_width);
1695     bit_width <<= 1;
1696   }
1697   return val;
1698 }
1699 
1700 static inline jlong replicate8_imm(int con, int width) {
1701   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1702   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1703   int bit_width = width * 8;
1704   jlong val = con;
1705   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1706   while(bit_width < 64) {
1707     val |= (val << bit_width);
1708     bit_width <<= 1;
1709   }
1710   return val;
1711 }
1712 
1713 #ifndef PRODUCT
1714   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1715     st->print("nop \t# %d bytes pad for loops and calls", _count);
1716   }
1717 #endif
1718 
1719   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1720     MacroAssembler _masm(&cbuf);
1721     __ nop(_count);
1722   }
1723 
1724   uint MachNopNode::size(PhaseRegAlloc*) const {
1725     return _count;
1726   }
1727 
1728 #ifndef PRODUCT
1729   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1730     st->print("# breakpoint");
1731   }
1732 #endif
1733 
1734   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1735     MacroAssembler _masm(&cbuf);
1736     __ int3();
1737   }
1738 
1739   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1740     return MachNode::size(ra_);
1741   }
1742 
1743 %}
1744 
1745 encode %{
1746 
1747   enc_class call_epilog %{
1748     if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find the magic cookie on the stack
1750       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1751       MacroAssembler _masm(&cbuf);
1752       Label L;
1753       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1754       __ jccb(Assembler::equal, L);
1755       // Die if stack mismatch
1756       __ int3();
1757       __ bind(L);
1758     }
1759   %}
1760 
1761 %}
1762 
1763 
1764 //----------OPERANDS-----------------------------------------------------------
1765 // Operand definitions must precede instruction definitions for correct parsing
1766 // in the ADLC because operands constitute user defined types which are used in
1767 // instruction definitions.
1768 
// This one applies only to EVEX, so there is only one version
1770 operand vecZ() %{
1771   constraint(ALLOC_IN_RC(vectorz_reg));
1772   match(VecZ);
1773 
1774   format %{ %}
1775   interface(REG_INTER);
1776 %}
1777 
1778 // Comparison Code for FP conditional move
1779 operand cmpOp_vcmppd() %{
1780   match(Bool);
1781 
1782   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
1783             n->as_Bool()->_test._test != BoolTest::no_overflow);
1784   format %{ "" %}
1785   interface(COND_INTER) %{
1786     equal        (0x0, "eq");
1787     less         (0x1, "lt");
1788     less_equal   (0x2, "le");
1789     not_equal    (0xC, "ne");
1790     greater_equal(0xD, "ge");
1791     greater      (0xE, "gt");
    //TODO: adlc fails to compile this operand without the next two lines, reporting:
1793     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
1794     // equal' for overflow.
1795     overflow     (0x20, "o");  // not really supported by the instruction
1796     no_overflow  (0x21, "no"); // not really supported by the instruction
1797   %}
1798 %}
1799 
1800 
1801 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1802 
1803 // ============================================================================
1804 
1805 instruct ShouldNotReachHere() %{
1806   match(Halt);
1807   format %{ "int3\t# ShouldNotReachHere" %}
1808   ins_encode %{
1809     __ int3();
1810   %}
1811   ins_pipe(pipe_slow);
1812 %}
1813 
1814 // =================================EVEX special===============================
1815 
1816 instruct setMask(rRegI dst, rRegI src) %{
1817   predicate(Matcher::has_predicated_vectors());
1818   match(Set dst (SetVectMaskI  src));
1819   effect(TEMP dst);
1820   format %{ "setvectmask   $dst, $src" %}
1821   ins_encode %{
1822     __ setvectmask($dst$$Register, $src$$Register);
1823   %}
1824   ins_pipe(pipe_slow);
1825 %}
1826 
1827 // ============================================================================
1828 
1829 instruct addF_reg(regF dst, regF src) %{
1830   predicate((UseSSE>=1) && (UseAVX == 0));
1831   match(Set dst (AddF dst src));
1832 
1833   format %{ "addss   $dst, $src" %}
1834   ins_cost(150);
1835   ins_encode %{
1836     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1837   %}
1838   ins_pipe(pipe_slow);
1839 %}
1840 
1841 instruct addF_mem(regF dst, memory src) %{
1842   predicate((UseSSE>=1) && (UseAVX == 0));
1843   match(Set dst (AddF dst (LoadF src)));
1844 
1845   format %{ "addss   $dst, $src" %}
1846   ins_cost(150);
1847   ins_encode %{
1848     __ addss($dst$$XMMRegister, $src$$Address);
1849   %}
1850   ins_pipe(pipe_slow);
1851 %}
1852 
1853 instruct addF_imm(regF dst, immF con) %{
1854   predicate((UseSSE>=1) && (UseAVX == 0));
1855   match(Set dst (AddF dst con));
1856   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1857   ins_cost(150);
1858   ins_encode %{
1859     __ addss($dst$$XMMRegister, $constantaddress($con));
1860   %}
1861   ins_pipe(pipe_slow);
1862 %}
1863 
1864 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1865   predicate(UseAVX > 0);
1866   match(Set dst (AddF src1 src2));
1867 
1868   format %{ "vaddss  $dst, $src1, $src2" %}
1869   ins_cost(150);
1870   ins_encode %{
1871     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1872   %}
1873   ins_pipe(pipe_slow);
1874 %}
1875 
1876 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1877   predicate(UseAVX > 0);
1878   match(Set dst (AddF src1 (LoadF src2)));
1879 
1880   format %{ "vaddss  $dst, $src1, $src2" %}
1881   ins_cost(150);
1882   ins_encode %{
1883     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1884   %}
1885   ins_pipe(pipe_slow);
1886 %}
1887 
1888 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1889   predicate(UseAVX > 0);
1890   match(Set dst (AddF src con));
1891 
1892   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1893   ins_cost(150);
1894   ins_encode %{
1895     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1896   %}
1897   ins_pipe(pipe_slow);
1898 %}
1899 
1900 instruct addD_reg(regD dst, regD src) %{
1901   predicate((UseSSE>=2) && (UseAVX == 0));
1902   match(Set dst (AddD dst src));
1903 
1904   format %{ "addsd   $dst, $src" %}
1905   ins_cost(150);
1906   ins_encode %{
1907     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1908   %}
1909   ins_pipe(pipe_slow);
1910 %}
1911 
1912 instruct addD_mem(regD dst, memory src) %{
1913   predicate((UseSSE>=2) && (UseAVX == 0));
1914   match(Set dst (AddD dst (LoadD src)));
1915 
1916   format %{ "addsd   $dst, $src" %}
1917   ins_cost(150);
1918   ins_encode %{
1919     __ addsd($dst$$XMMRegister, $src$$Address);
1920   %}
1921   ins_pipe(pipe_slow);
1922 %}
1923 
1924 instruct addD_imm(regD dst, immD con) %{
1925   predicate((UseSSE>=2) && (UseAVX == 0));
1926   match(Set dst (AddD dst con));
1927   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1928   ins_cost(150);
1929   ins_encode %{
1930     __ addsd($dst$$XMMRegister, $constantaddress($con));
1931   %}
1932   ins_pipe(pipe_slow);
1933 %}
1934 
1935 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1936   predicate(UseAVX > 0);
1937   match(Set dst (AddD src1 src2));
1938 
1939   format %{ "vaddsd  $dst, $src1, $src2" %}
1940   ins_cost(150);
1941   ins_encode %{
1942     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1943   %}
1944   ins_pipe(pipe_slow);
1945 %}
1946 
1947 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1948   predicate(UseAVX > 0);
1949   match(Set dst (AddD src1 (LoadD src2)));
1950 
1951   format %{ "vaddsd  $dst, $src1, $src2" %}
1952   ins_cost(150);
1953   ins_encode %{
1954     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1955   %}
1956   ins_pipe(pipe_slow);
1957 %}
1958 
1959 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1960   predicate(UseAVX > 0);
1961   match(Set dst (AddD src con));
1962 
1963   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1964   ins_cost(150);
1965   ins_encode %{
1966     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1967   %}
1968   ins_pipe(pipe_slow);
1969 %}
1970 
1971 instruct subF_reg(regF dst, regF src) %{
1972   predicate((UseSSE>=1) && (UseAVX == 0));
1973   match(Set dst (SubF dst src));
1974 
1975   format %{ "subss   $dst, $src" %}
1976   ins_cost(150);
1977   ins_encode %{
1978     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1979   %}
1980   ins_pipe(pipe_slow);
1981 %}
1982 
1983 instruct subF_mem(regF dst, memory src) %{
1984   predicate((UseSSE>=1) && (UseAVX == 0));
1985   match(Set dst (SubF dst (LoadF src)));
1986 
1987   format %{ "subss   $dst, $src" %}
1988   ins_cost(150);
1989   ins_encode %{
1990     __ subss($dst$$XMMRegister, $src$$Address);
1991   %}
1992   ins_pipe(pipe_slow);
1993 %}
1994 
1995 instruct subF_imm(regF dst, immF con) %{
1996   predicate((UseSSE>=1) && (UseAVX == 0));
1997   match(Set dst (SubF dst con));
1998   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1999   ins_cost(150);
2000   ins_encode %{
2001     __ subss($dst$$XMMRegister, $constantaddress($con));
2002   %}
2003   ins_pipe(pipe_slow);
2004 %}
2005 
2006 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2007   predicate(UseAVX > 0);
2008   match(Set dst (SubF src1 src2));
2009 
2010   format %{ "vsubss  $dst, $src1, $src2" %}
2011   ins_cost(150);
2012   ins_encode %{
2013     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2014   %}
2015   ins_pipe(pipe_slow);
2016 %}
2017 
2018 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2019   predicate(UseAVX > 0);
2020   match(Set dst (SubF src1 (LoadF src2)));
2021 
2022   format %{ "vsubss  $dst, $src1, $src2" %}
2023   ins_cost(150);
2024   ins_encode %{
2025     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2026   %}
2027   ins_pipe(pipe_slow);
2028 %}
2029 
2030 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2031   predicate(UseAVX > 0);
2032   match(Set dst (SubF src con));
2033 
2034   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2035   ins_cost(150);
2036   ins_encode %{
2037     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2038   %}
2039   ins_pipe(pipe_slow);
2040 %}
2041 
2042 instruct subD_reg(regD dst, regD src) %{
2043   predicate((UseSSE>=2) && (UseAVX == 0));
2044   match(Set dst (SubD dst src));
2045 
2046   format %{ "subsd   $dst, $src" %}
2047   ins_cost(150);
2048   ins_encode %{
2049     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2050   %}
2051   ins_pipe(pipe_slow);
2052 %}
2053 
2054 instruct subD_mem(regD dst, memory src) %{
2055   predicate((UseSSE>=2) && (UseAVX == 0));
2056   match(Set dst (SubD dst (LoadD src)));
2057 
2058   format %{ "subsd   $dst, $src" %}
2059   ins_cost(150);
2060   ins_encode %{
2061     __ subsd($dst$$XMMRegister, $src$$Address);
2062   %}
2063   ins_pipe(pipe_slow);
2064 %}
2065 
2066 instruct subD_imm(regD dst, immD con) %{
2067   predicate((UseSSE>=2) && (UseAVX == 0));
2068   match(Set dst (SubD dst con));
2069   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2070   ins_cost(150);
2071   ins_encode %{
2072     __ subsd($dst$$XMMRegister, $constantaddress($con));
2073   %}
2074   ins_pipe(pipe_slow);
2075 %}
2076 
2077 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2078   predicate(UseAVX > 0);
2079   match(Set dst (SubD src1 src2));
2080 
2081   format %{ "vsubsd  $dst, $src1, $src2" %}
2082   ins_cost(150);
2083   ins_encode %{
2084     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2085   %}
2086   ins_pipe(pipe_slow);
2087 %}
2088 
2089 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2090   predicate(UseAVX > 0);
2091   match(Set dst (SubD src1 (LoadD src2)));
2092 
2093   format %{ "vsubsd  $dst, $src1, $src2" %}
2094   ins_cost(150);
2095   ins_encode %{
2096     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2097   %}
2098   ins_pipe(pipe_slow);
2099 %}
2100 
2101 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2102   predicate(UseAVX > 0);
2103   match(Set dst (SubD src con));
2104 
2105   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2106   ins_cost(150);
2107   ins_encode %{
2108     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2109   %}
2110   ins_pipe(pipe_slow);
2111 %}
2112 
2113 instruct mulF_reg(regF dst, regF src) %{
2114   predicate((UseSSE>=1) && (UseAVX == 0));
2115   match(Set dst (MulF dst src));
2116 
2117   format %{ "mulss   $dst, $src" %}
2118   ins_cost(150);
2119   ins_encode %{
2120     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2121   %}
2122   ins_pipe(pipe_slow);
2123 %}
2124 
2125 instruct mulF_mem(regF dst, memory src) %{
2126   predicate((UseSSE>=1) && (UseAVX == 0));
2127   match(Set dst (MulF dst (LoadF src)));
2128 
2129   format %{ "mulss   $dst, $src" %}
2130   ins_cost(150);
2131   ins_encode %{
2132     __ mulss($dst$$XMMRegister, $src$$Address);
2133   %}
2134   ins_pipe(pipe_slow);
2135 %}
2136 
2137 instruct mulF_imm(regF dst, immF con) %{
2138   predicate((UseSSE>=1) && (UseAVX == 0));
2139   match(Set dst (MulF dst con));
2140   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2141   ins_cost(150);
2142   ins_encode %{
2143     __ mulss($dst$$XMMRegister, $constantaddress($con));
2144   %}
2145   ins_pipe(pipe_slow);
2146 %}
2147 
2148 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2149   predicate(UseAVX > 0);
2150   match(Set dst (MulF src1 src2));
2151 
2152   format %{ "vmulss  $dst, $src1, $src2" %}
2153   ins_cost(150);
2154   ins_encode %{
2155     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2156   %}
2157   ins_pipe(pipe_slow);
2158 %}
2159 
2160 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2161   predicate(UseAVX > 0);
2162   match(Set dst (MulF src1 (LoadF src2)));
2163 
2164   format %{ "vmulss  $dst, $src1, $src2" %}
2165   ins_cost(150);
2166   ins_encode %{
2167     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2168   %}
2169   ins_pipe(pipe_slow);
2170 %}
2171 
2172 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2173   predicate(UseAVX > 0);
2174   match(Set dst (MulF src con));
2175 
2176   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2177   ins_cost(150);
2178   ins_encode %{
2179     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2180   %}
2181   ins_pipe(pipe_slow);
2182 %}
2183 
2184 instruct mulD_reg(regD dst, regD src) %{
2185   predicate((UseSSE>=2) && (UseAVX == 0));
2186   match(Set dst (MulD dst src));
2187 
2188   format %{ "mulsd   $dst, $src" %}
2189   ins_cost(150);
2190   ins_encode %{
2191     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2192   %}
2193   ins_pipe(pipe_slow);
2194 %}
2195 
2196 instruct mulD_mem(regD dst, memory src) %{
2197   predicate((UseSSE>=2) && (UseAVX == 0));
2198   match(Set dst (MulD dst (LoadD src)));
2199 
2200   format %{ "mulsd   $dst, $src" %}
2201   ins_cost(150);
2202   ins_encode %{
2203     __ mulsd($dst$$XMMRegister, $src$$Address);
2204   %}
2205   ins_pipe(pipe_slow);
2206 %}
2207 
2208 instruct mulD_imm(regD dst, immD con) %{
2209   predicate((UseSSE>=2) && (UseAVX == 0));
2210   match(Set dst (MulD dst con));
2211   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2212   ins_cost(150);
2213   ins_encode %{
2214     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2215   %}
2216   ins_pipe(pipe_slow);
2217 %}
2218 
2219 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2220   predicate(UseAVX > 0);
2221   match(Set dst (MulD src1 src2));
2222 
2223   format %{ "vmulsd  $dst, $src1, $src2" %}
2224   ins_cost(150);
2225   ins_encode %{
2226     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2227   %}
2228   ins_pipe(pipe_slow);
2229 %}
2230 
2231 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2232   predicate(UseAVX > 0);
2233   match(Set dst (MulD src1 (LoadD src2)));
2234 
2235   format %{ "vmulsd  $dst, $src1, $src2" %}
2236   ins_cost(150);
2237   ins_encode %{
2238     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2239   %}
2240   ins_pipe(pipe_slow);
2241 %}
2242 
2243 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2244   predicate(UseAVX > 0);
2245   match(Set dst (MulD src con));
2246 
2247   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2248   ins_cost(150);
2249   ins_encode %{
2250     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2251   %}
2252   ins_pipe(pipe_slow);
2253 %}
2254 
2255 instruct divF_reg(regF dst, regF src) %{
2256   predicate((UseSSE>=1) && (UseAVX == 0));
2257   match(Set dst (DivF dst src));
2258 
2259   format %{ "divss   $dst, $src" %}
2260   ins_cost(150);
2261   ins_encode %{
2262     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2263   %}
2264   ins_pipe(pipe_slow);
2265 %}
2266 
2267 instruct divF_mem(regF dst, memory src) %{
2268   predicate((UseSSE>=1) && (UseAVX == 0));
2269   match(Set dst (DivF dst (LoadF src)));
2270 
2271   format %{ "divss   $dst, $src" %}
2272   ins_cost(150);
2273   ins_encode %{
2274     __ divss($dst$$XMMRegister, $src$$Address);
2275   %}
2276   ins_pipe(pipe_slow);
2277 %}
2278 
2279 instruct divF_imm(regF dst, immF con) %{
2280   predicate((UseSSE>=1) && (UseAVX == 0));
2281   match(Set dst (DivF dst con));
2282   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2283   ins_cost(150);
2284   ins_encode %{
2285     __ divss($dst$$XMMRegister, $constantaddress($con));
2286   %}
2287   ins_pipe(pipe_slow);
2288 %}
2289 
2290 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2291   predicate(UseAVX > 0);
2292   match(Set dst (DivF src1 src2));
2293 
2294   format %{ "vdivss  $dst, $src1, $src2" %}
2295   ins_cost(150);
2296   ins_encode %{
2297     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2298   %}
2299   ins_pipe(pipe_slow);
2300 %}
2301 
2302 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2303   predicate(UseAVX > 0);
2304   match(Set dst (DivF src1 (LoadF src2)));
2305 
2306   format %{ "vdivss  $dst, $src1, $src2" %}
2307   ins_cost(150);
2308   ins_encode %{
2309     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2310   %}
2311   ins_pipe(pipe_slow);
2312 %}
2313 
2314 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2315   predicate(UseAVX > 0);
2316   match(Set dst (DivF src con));
2317 
2318   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2319   ins_cost(150);
2320   ins_encode %{
2321     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2322   %}
2323   ins_pipe(pipe_slow);
2324 %}
2325 
2326 instruct divD_reg(regD dst, regD src) %{
2327   predicate((UseSSE>=2) && (UseAVX == 0));
2328   match(Set dst (DivD dst src));
2329 
2330   format %{ "divsd   $dst, $src" %}
2331   ins_cost(150);
2332   ins_encode %{
2333     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2334   %}
2335   ins_pipe(pipe_slow);
2336 %}
2337 
2338 instruct divD_mem(regD dst, memory src) %{
2339   predicate((UseSSE>=2) && (UseAVX == 0));
2340   match(Set dst (DivD dst (LoadD src)));
2341 
2342   format %{ "divsd   $dst, $src" %}
2343   ins_cost(150);
2344   ins_encode %{
2345     __ divsd($dst$$XMMRegister, $src$$Address);
2346   %}
2347   ins_pipe(pipe_slow);
2348 %}
2349 
2350 instruct divD_imm(regD dst, immD con) %{
2351   predicate((UseSSE>=2) && (UseAVX == 0));
2352   match(Set dst (DivD dst con));
2353   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2354   ins_cost(150);
2355   ins_encode %{
2356     __ divsd($dst$$XMMRegister, $constantaddress($con));
2357   %}
2358   ins_pipe(pipe_slow);
2359 %}
2360 
2361 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2362   predicate(UseAVX > 0);
2363   match(Set dst (DivD src1 src2));
2364 
2365   format %{ "vdivsd  $dst, $src1, $src2" %}
2366   ins_cost(150);
2367   ins_encode %{
2368     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2369   %}
2370   ins_pipe(pipe_slow);
2371 %}
2372 
2373 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2374   predicate(UseAVX > 0);
2375   match(Set dst (DivD src1 (LoadD src2)));
2376 
2377   format %{ "vdivsd  $dst, $src1, $src2" %}
2378   ins_cost(150);
2379   ins_encode %{
2380     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2381   %}
2382   ins_pipe(pipe_slow);
2383 %}
2384 
2385 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2386   predicate(UseAVX > 0);
2387   match(Set dst (DivD src con));
2388 
2389   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2390   ins_cost(150);
2391   ins_encode %{
2392     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2393   %}
2394   ins_pipe(pipe_slow);
2395 %}
2396 
2397 instruct absF_reg(regF dst) %{
2398   predicate((UseSSE>=1) && (UseAVX == 0));
2399   match(Set dst (AbsF dst));
2400   ins_cost(150);
2401   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2402   ins_encode %{
2403     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2404   %}
2405   ins_pipe(pipe_slow);
2406 %}
2407 
2408 instruct absF_reg_reg(regF dst, regF src) %{
2409   predicate(VM_Version::supports_avxonly());
2410   match(Set dst (AbsF src));
2411   ins_cost(150);
2412   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2413   ins_encode %{
2414     int vector_len = 0;
2415     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2416               ExternalAddress(float_signmask()), vector_len);
2417   %}
2418   ins_pipe(pipe_slow);
2419 %}
2420 
2421 #ifdef _LP64
2422 instruct absF_reg_reg_evex(regF dst, regF src) %{
2423   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2424   match(Set dst (AbsF src));
2425   ins_cost(150);
2426   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2427   ins_encode %{
2428     int vector_len = 0;
2429     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2430               ExternalAddress(float_signmask()), vector_len);
2431   %}
2432   ins_pipe(pipe_slow);
2433 %}
2434 
2435 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2436   predicate(VM_Version::supports_avx512novl());
2437   match(Set dst (AbsF src1));
2438   effect(TEMP src2);
2439   ins_cost(150);
2440   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2441   ins_encode %{
2442     int vector_len = 0;
2443     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2444               ExternalAddress(float_signmask()), vector_len);
2445   %}
2446   ins_pipe(pipe_slow);
2447 %}
2448 #else // _LP64
2449 instruct absF_reg_reg_evex(regF dst, regF src) %{
2450   predicate(UseAVX > 2);
2451   match(Set dst (AbsF src));
2452   ins_cost(150);
2453   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2454   ins_encode %{
2455     int vector_len = 0;
2456     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2457               ExternalAddress(float_signmask()), vector_len);
2458   %}
2459   ins_pipe(pipe_slow);
2460 %}
2461 #endif
2462 
2463 instruct absD_reg(regD dst) %{
2464   predicate((UseSSE>=2) && (UseAVX == 0));
2465   match(Set dst (AbsD dst));
2466   ins_cost(150);
2467   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2468             "# abs double by sign masking" %}
2469   ins_encode %{
2470     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2471   %}
2472   ins_pipe(pipe_slow);
2473 %}
2474 
2475 instruct absD_reg_reg(regD dst, regD src) %{
2476   predicate(VM_Version::supports_avxonly());
2477   match(Set dst (AbsD src));
2478   ins_cost(150);
2479   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2480             "# abs double by sign masking" %}
2481   ins_encode %{
2482     int vector_len = 0;
2483     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2484               ExternalAddress(double_signmask()), vector_len);
2485   %}
2486   ins_pipe(pipe_slow);
2487 %}
2488 
2489 #ifdef _LP64
2490 instruct absD_reg_reg_evex(regD dst, regD src) %{
2491   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2492   match(Set dst (AbsD src));
2493   ins_cost(150);
2494   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2495             "# abs double by sign masking" %}
2496   ins_encode %{
2497     int vector_len = 0;
2498     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2499               ExternalAddress(double_signmask()), vector_len);
2500   %}
2501   ins_pipe(pipe_slow);
2502 %}
2503 
2504 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2505   predicate(VM_Version::supports_avx512novl());
2506   match(Set dst (AbsD src1));
2507   effect(TEMP src2);
2508   ins_cost(150);
2509   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs double by sign masking" %}
2510   ins_encode %{
2511     int vector_len = 0;
2512     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2513               ExternalAddress(double_signmask()), vector_len);
2514   %}
2515   ins_pipe(pipe_slow);
2516 %}
2517 #else // _LP64
2518 instruct absD_reg_reg_evex(regD dst, regD src) %{
2519   predicate(UseAVX > 2);
2520   match(Set dst (AbsD src));
2521   ins_cost(150);
2522   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2523             "# abs double by sign masking" %}
2524   ins_encode %{
2525     int vector_len = 0;
2526     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2527               ExternalAddress(double_signmask()), vector_len);
2528   %}
2529   ins_pipe(pipe_slow);
2530 %}
2531 #endif
2532 
2533 instruct negF_reg(regF dst) %{
2534   predicate((UseSSE>=1) && (UseAVX == 0));
2535   match(Set dst (NegF dst));
2536   ins_cost(150);
2537   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2538   ins_encode %{
2539     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2540   %}
2541   ins_pipe(pipe_slow);
2542 %}
2543 
2544 instruct negF_reg_reg(regF dst, regF src) %{
2545   predicate(UseAVX > 0);
2546   match(Set dst (NegF src));
2547   ins_cost(150);
2548   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2549   ins_encode %{
2550     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2551                  ExternalAddress(float_signflip()));
2552   %}
2553   ins_pipe(pipe_slow);
2554 %}
2555 
2556 instruct negD_reg(regD dst) %{
2557   predicate((UseSSE>=2) && (UseAVX == 0));
2558   match(Set dst (NegD dst));
2559   ins_cost(150);
2560   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2561             "# neg double by sign flipping" %}
2562   ins_encode %{
2563     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2564   %}
2565   ins_pipe(pipe_slow);
2566 %}
2567 
2568 instruct negD_reg_reg(regD dst, regD src) %{
2569   predicate(UseAVX > 0);
2570   match(Set dst (NegD src));
2571   ins_cost(150);
2572   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2573             "# neg double by sign flipping" %}
2574   ins_encode %{
2575     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2576                  ExternalAddress(double_signflip()));
2577   %}
2578   ins_pipe(pipe_slow);
2579 %}
2580 
2581 instruct sqrtF_reg(regF dst, regF src) %{
2582   predicate(UseSSE>=1);
2583   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
2584 
2585   format %{ "sqrtss  $dst, $src" %}
2586   ins_cost(150);
2587   ins_encode %{
2588     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2589   %}
2590   ins_pipe(pipe_slow);
2591 %}
2592 
2593 instruct sqrtF_mem(regF dst, memory src) %{
2594   predicate(UseSSE>=1);
2595   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
2596 
2597   format %{ "sqrtss  $dst, $src" %}
2598   ins_cost(150);
2599   ins_encode %{
2600     __ sqrtss($dst$$XMMRegister, $src$$Address);
2601   %}
2602   ins_pipe(pipe_slow);
2603 %}
2604 
2605 instruct sqrtF_imm(regF dst, immF con) %{
2606   predicate(UseSSE>=1);
2607   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
2608   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2609   ins_cost(150);
2610   ins_encode %{
2611     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2612   %}
2613   ins_pipe(pipe_slow);
2614 %}
2615 
2616 instruct sqrtD_reg(regD dst, regD src) %{
2617   predicate(UseSSE>=2);
2618   match(Set dst (SqrtD src));
2619 
2620   format %{ "sqrtsd  $dst, $src" %}
2621   ins_cost(150);
2622   ins_encode %{
2623     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2624   %}
2625   ins_pipe(pipe_slow);
2626 %}
2627 
2628 instruct sqrtD_mem(regD dst, memory src) %{
2629   predicate(UseSSE>=2);
2630   match(Set dst (SqrtD (LoadD src)));
2631 
2632   format %{ "sqrtsd  $dst, $src" %}
2633   ins_cost(150);
2634   ins_encode %{
2635     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2636   %}
2637   ins_pipe(pipe_slow);
2638 %}
2639 
2640 instruct sqrtD_imm(regD dst, immD con) %{
2641   predicate(UseSSE>=2);
2642   match(Set dst (SqrtD con));
2643   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2644   ins_cost(150);
2645   ins_encode %{
2646     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2647   %}
2648   ins_pipe(pipe_slow);
2649 %}
2650 
2651 instruct onspinwait() %{
2652   match(OnSpinWait);
2653   ins_cost(200);
2654 
2655   format %{
2656     $$template
2657     if (os::is_MP()) {
2658       $$emit$$"pause\t! membar_onspinwait"
2659     } else {
2660       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
2661     }
2662   %}
2663   ins_encode %{
2664     __ pause();
2665   %}
2666   ins_pipe(pipe_slow);
2667 %}
2668 
2669 // ====================VECTOR INSTRUCTIONS=====================================
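     // In the vector patterns below, the vector_len argument passed to the assembler
     // selects the encoding width: 0 = 128-bit (XMM), 1 = 256-bit (YMM),
     // 2 = 512-bit (ZMM, EVEX-encoded).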
2670 
2671 // Load vectors (4 bytes long)
2672 instruct loadV4(vecS dst, memory mem) %{
2673   predicate(n->as_LoadVector()->memory_size() == 4);
2674   match(Set dst (LoadVector mem));
2675   ins_cost(125);
2676   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2677   ins_encode %{
2678     __ movdl($dst$$XMMRegister, $mem$$Address);
2679   %}
2680   ins_pipe( pipe_slow );
2681 %}
2682 
2683 // Load vectors (8 bytes long)
2684 instruct loadV8(vecD dst, memory mem) %{
2685   predicate(n->as_LoadVector()->memory_size() == 8);
2686   match(Set dst (LoadVector mem));
2687   ins_cost(125);
2688   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2689   ins_encode %{
2690     __ movq($dst$$XMMRegister, $mem$$Address);
2691   %}
2692   ins_pipe( pipe_slow );
2693 %}
2694 
2695 // Load vectors (16 bytes long)
2696 instruct loadV16(vecX dst, memory mem) %{
2697   predicate(n->as_LoadVector()->memory_size() == 16);
2698   match(Set dst (LoadVector mem));
2699   ins_cost(125);
2700   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2701   ins_encode %{
2702     __ movdqu($dst$$XMMRegister, $mem$$Address);
2703   %}
2704   ins_pipe( pipe_slow );
2705 %}
2706 
2707 // Load vectors (32 bytes long)
2708 instruct loadV32(vecY dst, memory mem) %{
2709   predicate(n->as_LoadVector()->memory_size() == 32);
2710   match(Set dst (LoadVector mem));
2711   ins_cost(125);
2712   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2713   ins_encode %{
2714     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2715   %}
2716   ins_pipe( pipe_slow );
2717 %}
2718 
2719 // Load vectors (64 bytes long)
2720 instruct loadV64_dword(vecZ dst, memory mem) %{
2721   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2722   match(Set dst (LoadVector mem));
2723   ins_cost(125);
2724   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
2725   ins_encode %{
2726     int vector_len = 2;
2727     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
2728   %}
2729   ins_pipe( pipe_slow );
2730 %}
2731 
2732 // Load vectors (64 bytes long)
2733 instruct loadV64_qword(vecZ dst, memory mem) %{
2734   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
2735   match(Set dst (LoadVector mem));
2736   ins_cost(125);
2737   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
2738   ins_encode %{
2739     int vector_len = 2;
2740     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
2741   %}
2742   ins_pipe( pipe_slow );
2743 %}
2744 
2745 // Store vectors
2746 instruct storeV4(memory mem, vecS src) %{
2747   predicate(n->as_StoreVector()->memory_size() == 4);
2748   match(Set mem (StoreVector mem src));
2749   ins_cost(145);
2750   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
2751   ins_encode %{
2752     __ movdl($mem$$Address, $src$$XMMRegister);
2753   %}
2754   ins_pipe( pipe_slow );
2755 %}
2756 
2757 instruct storeV8(memory mem, vecD src) %{
2758   predicate(n->as_StoreVector()->memory_size() == 8);
2759   match(Set mem (StoreVector mem src));
2760   ins_cost(145);
2761   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
2762   ins_encode %{
2763     __ movq($mem$$Address, $src$$XMMRegister);
2764   %}
2765   ins_pipe( pipe_slow );
2766 %}
2767 
2768 instruct storeV16(memory mem, vecX src) %{
2769   predicate(n->as_StoreVector()->memory_size() == 16);
2770   match(Set mem (StoreVector mem src));
2771   ins_cost(145);
2772   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
2773   ins_encode %{
2774     __ movdqu($mem$$Address, $src$$XMMRegister);
2775   %}
2776   ins_pipe( pipe_slow );
2777 %}
2778 
2779 instruct storeV32(memory mem, vecY src) %{
2780   predicate(n->as_StoreVector()->memory_size() == 32);
2781   match(Set mem (StoreVector mem src));
2782   ins_cost(145);
2783   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2784   ins_encode %{
2785     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2786   %}
2787   ins_pipe( pipe_slow );
2788 %}
2789 
2790 instruct storeV64_dword(memory mem, vecZ src) %{
2791   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
2792   match(Set mem (StoreVector mem src));
2793   ins_cost(145);
2794   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
2795   ins_encode %{
2796     int vector_len = 2;
2797     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
2798   %}
2799   ins_pipe( pipe_slow );
2800 %}
2801 
2802 instruct storeV64_qword(memory mem, vecZ src) %{
2803   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
2804   match(Set mem (StoreVector mem src));
2805   ins_cost(145);
2806   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
2807   ins_encode %{
2808     int vector_len = 2;
2809     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
2810   %}
2811   ins_pipe( pipe_slow );
2812 %}
2813 
2814 // ====================LEGACY REPLICATE=======================================
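     // The patterns in this section are guarded by predicates such as
     // !VM_Version::supports_avx512vlbw(), !VM_Version::supports_avx512vl() and
     // UseAVX < 3, i.e. they are used when the corresponding EVEX broadcast forms
     // (see the EVEX REPLICATE section further below) are not available.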
2815 
2816 instruct Repl4B_mem(vecS dst, memory mem) %{
2817   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2818   match(Set dst (ReplicateB (LoadB mem)));
2819   format %{ "punpcklbw $dst,$mem\n\t"
2820             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
2821   ins_encode %{
2822     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2823     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2824   %}
2825   ins_pipe( pipe_slow );
2826 %}
2827 
2828 instruct Repl8B_mem(vecD dst, memory mem) %{
2829   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2830   match(Set dst (ReplicateB (LoadB mem)));
2831   format %{ "punpcklbw $dst,$mem\n\t"
2832             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
2833   ins_encode %{
2834     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2835     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2836   %}
2837   ins_pipe( pipe_slow );
2838 %}
2839 
2840 instruct Repl16B(vecX dst, rRegI src) %{
2841   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2842   match(Set dst (ReplicateB src));
2843   format %{ "movd    $dst,$src\n\t"
2844             "punpcklbw $dst,$dst\n\t"
2845             "pshuflw $dst,$dst,0x00\n\t"
2846             "punpcklqdq $dst,$dst\t! replicate16B" %}
2847   ins_encode %{
2848     __ movdl($dst$$XMMRegister, $src$$Register);
2849     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2850     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2851     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2852   %}
2853   ins_pipe( pipe_slow );
2854 %}
2855 
2856 instruct Repl16B_mem(vecX dst, memory mem) %{
2857   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2858   match(Set dst (ReplicateB (LoadB mem)));
2859   format %{ "punpcklbw $dst,$mem\n\t"
2860             "pshuflw $dst,$dst,0x00\n\t"
2861             "punpcklqdq $dst,$dst\t! replicate16B" %}
2862   ins_encode %{
2863     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2864     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2865     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2866   %}
2867   ins_pipe( pipe_slow );
2868 %}
2869 
2870 instruct Repl32B(vecY dst, rRegI src) %{
2871   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2872   match(Set dst (ReplicateB src));
2873   format %{ "movd    $dst,$src\n\t"
2874             "punpcklbw $dst,$dst\n\t"
2875             "pshuflw $dst,$dst,0x00\n\t"
2876             "punpcklqdq $dst,$dst\n\t"
2877             "vinserti128_high $dst,$dst\t! replicate32B" %}
2878   ins_encode %{
2879     __ movdl($dst$$XMMRegister, $src$$Register);
2880     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2881     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2882     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2883     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2884   %}
2885   ins_pipe( pipe_slow );
2886 %}
2887 
2888 instruct Repl32B_mem(vecY dst, memory mem) %{
2889   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2890   match(Set dst (ReplicateB (LoadB mem)));
2891   format %{ "punpcklbw $dst,$mem\n\t"
2892             "pshuflw $dst,$dst,0x00\n\t"
2893             "punpcklqdq $dst,$dst\n\t"
2894             "vinserti128_high $dst,$dst\t! replicate32B" %}
2895   ins_encode %{
2896     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2897     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2898     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2899     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2900   %}
2901   ins_pipe( pipe_slow );
2902 %}
2903 
2904 instruct Repl16B_imm(vecX dst, immI con) %{
2905   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2906   match(Set dst (ReplicateB con));
2907   format %{ "movq    $dst,[$constantaddress]\n\t"
2908             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
2909   ins_encode %{
2910     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2911     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2912   %}
2913   ins_pipe( pipe_slow );
2914 %}
2915 
2916 instruct Repl32B_imm(vecY dst, immI con) %{
2917   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
2918   match(Set dst (ReplicateB con));
2919   format %{ "movq    $dst,[$constantaddress]\n\t"
2920             "punpcklqdq $dst,$dst\n\t"
2921             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
2922   ins_encode %{
2923     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2924     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2925     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
2926   %}
2927   ins_pipe( pipe_slow );
2928 %}
2929 
2930 instruct Repl4S(vecD dst, rRegI src) %{
2931   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
2932   match(Set dst (ReplicateS src));
2933   format %{ "movd    $dst,$src\n\t"
2934             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
2935   ins_encode %{
2936     __ movdl($dst$$XMMRegister, $src$$Register);
2937     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2938   %}
2939   ins_pipe( pipe_slow );
2940 %}
2941 
2942 instruct Repl4S_mem(vecD dst, memory mem) %{
2943   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2944   match(Set dst (ReplicateS (LoadS mem)));
2945   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
2946   ins_encode %{
2947     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
2948   %}
2949   ins_pipe( pipe_slow );
2950 %}
2951 
2952 instruct Repl8S(vecX dst, rRegI src) %{
2953   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
2954   match(Set dst (ReplicateS src));
2955   format %{ "movd    $dst,$src\n\t"
2956             "pshuflw $dst,$dst,0x00\n\t"
2957             "punpcklqdq $dst,$dst\t! replicate8S" %}
2958   ins_encode %{
2959     __ movdl($dst$$XMMRegister, $src$$Register);
2960     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2961     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2962   %}
2963   ins_pipe( pipe_slow );
2964 %}
2965 
2966 instruct Repl8S_mem(vecX dst, memory mem) %{
2967   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2968   match(Set dst (ReplicateS (LoadS mem)));
2969   format %{ "pshuflw $dst,$mem,0x00\n\t"
2970             "punpcklqdq $dst,$dst\t! replicate8S" %}
2971   ins_encode %{
2972     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
2973     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2974   %}
2975   ins_pipe( pipe_slow );
2976 %}
2977 
2978 instruct Repl8S_imm(vecX dst, immI con) %{
2979   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
2980   match(Set dst (ReplicateS con));
2981   format %{ "movq    $dst,[$constantaddress]\n\t"
2982             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
2983   ins_encode %{
2984     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2985     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2986   %}
2987   ins_pipe( pipe_slow );
2988 %}
2989 
2990 instruct Repl16S(vecY dst, rRegI src) %{
2991   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
2992   match(Set dst (ReplicateS src));
2993   format %{ "movd    $dst,$src\n\t"
2994             "pshuflw $dst,$dst,0x00\n\t"
2995             "punpcklqdq $dst,$dst\n\t"
2996             "vinserti128_high $dst,$dst\t! replicate16S" %}
2997   ins_encode %{
2998     __ movdl($dst$$XMMRegister, $src$$Register);
2999     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3000     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3001     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3002   %}
3003   ins_pipe( pipe_slow );
3004 %}
3005 
3006 instruct Repl16S_mem(vecY dst, memory mem) %{
3007   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3008   match(Set dst (ReplicateS (LoadS mem)));
3009   format %{ "pshuflw $dst,$mem,0x00\n\t"
3010             "punpcklqdq $dst,$dst\n\t"
3011             "vinserti128_high $dst,$dst\t! replicate16S" %}
3012   ins_encode %{
3013     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3014     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3015     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3016   %}
3017   ins_pipe( pipe_slow );
3018 %}
3019 
3020 instruct Repl16S_imm(vecY dst, immI con) %{
3021   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3022   match(Set dst (ReplicateS con));
3023   format %{ "movq    $dst,[$constantaddress]\n\t"
3024             "punpcklqdq $dst,$dst\n\t"
3025             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3026   ins_encode %{
3027     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3028     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3029     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3030   %}
3031   ins_pipe( pipe_slow );
3032 %}
3033 
3034 instruct Repl4I(vecX dst, rRegI src) %{
3035   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3036   match(Set dst (ReplicateI src));
3037   format %{ "movd    $dst,$src\n\t"
3038             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3039   ins_encode %{
3040     __ movdl($dst$$XMMRegister, $src$$Register);
3041     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3042   %}
3043   ins_pipe( pipe_slow );
3044 %}
3045 
3046 instruct Repl4I_mem(vecX dst, memory mem) %{
3047   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3048   match(Set dst (ReplicateI (LoadI mem)));
3049   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3050   ins_encode %{
3051     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3052   %}
3053   ins_pipe( pipe_slow );
3054 %}
3055 
3056 instruct Repl8I(vecY dst, rRegI src) %{
3057   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3058   match(Set dst (ReplicateI src));
3059   format %{ "movd    $dst,$src\n\t"
3060             "pshufd  $dst,$dst,0x00\n\t"
3061             "vinserti128_high $dst,$dst\t! replicate8I" %}
3062   ins_encode %{
3063     __ movdl($dst$$XMMRegister, $src$$Register);
3064     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3065     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3066   %}
3067   ins_pipe( pipe_slow );
3068 %}
3069 
3070 instruct Repl8I_mem(vecY dst, memory mem) %{
3071   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3072   match(Set dst (ReplicateI (LoadI mem)));
3073   format %{ "pshufd  $dst,$mem,0x00\n\t"
3074             "vinserti128_high $dst,$dst\t! replicate8I" %}
3075   ins_encode %{
3076     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3077     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3078   %}
3079   ins_pipe( pipe_slow );
3080 %}
3081 
3082 instruct Repl4I_imm(vecX dst, immI con) %{
3083   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3084   match(Set dst (ReplicateI con));
3085   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3086             "punpcklqdq $dst,$dst" %}
3087   ins_encode %{
3088     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3089     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3090   %}
3091   ins_pipe( pipe_slow );
3092 %}
3093 
3094 instruct Repl8I_imm(vecY dst, immI con) %{
3095   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3096   match(Set dst (ReplicateI con));
3097   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3098             "punpcklqdq $dst,$dst\n\t"
3099             "vinserti128_high $dst,$dst" %}
3100   ins_encode %{
3101     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3102     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3103     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3104   %}
3105   ins_pipe( pipe_slow );
3106 %}
3107 
3108 // A long can be loaded into an XMM register directly from memory.
3109 instruct Repl2L_mem(vecX dst, memory mem) %{
3110   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3111   match(Set dst (ReplicateL (LoadL mem)));
3112   format %{ "movq    $dst,$mem\n\t"
3113             "punpcklqdq $dst,$dst\t! replicate2L" %}
3114   ins_encode %{
3115     __ movq($dst$$XMMRegister, $mem$$Address);
3116     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3117   %}
3118   ins_pipe( pipe_slow );
3119 %}
3120 
3121 // Replicate long (8 byte) scalar to be vector
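     // On 32-bit (!_LP64) the long scalar lives in a register pair, so the pattern
     // moves the low and high halves into XMM registers separately and merges them
     // with punpckldq before broadcasting.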
3122 #ifdef _LP64
3123 instruct Repl4L(vecY dst, rRegL src) %{
3124   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3125   match(Set dst (ReplicateL src));
3126   format %{ "movdq   $dst,$src\n\t"
3127             "punpcklqdq $dst,$dst\n\t"
3128             "vinserti128_high $dst,$dst\t! replicate4L" %}
3129   ins_encode %{
3130     __ movdq($dst$$XMMRegister, $src$$Register);
3131     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3132     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3133   %}
3134   ins_pipe( pipe_slow );
3135 %}
3136 #else // _LP64
3137 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3138   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3139   match(Set dst (ReplicateL src));
3140   effect(TEMP dst, USE src, TEMP tmp);
3141   format %{ "movdl   $dst,$src.lo\n\t"
3142             "movdl   $tmp,$src.hi\n\t"
3143             "punpckldq $dst,$tmp\n\t"
3144             "punpcklqdq $dst,$dst\n\t"
3145             "vinserti128_high $dst,$dst\t! replicate4L" %}
3146   ins_encode %{
3147     __ movdl($dst$$XMMRegister, $src$$Register);
3148     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3149     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3150     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3151     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3152   %}
3153   ins_pipe( pipe_slow );
3154 %}
3155 #endif // _LP64
3156 
3157 instruct Repl4L_imm(vecY dst, immL con) %{
3158   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3159   match(Set dst (ReplicateL con));
3160   format %{ "movq    $dst,[$constantaddress]\n\t"
3161             "punpcklqdq $dst,$dst\n\t"
3162             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3163   ins_encode %{
3164     __ movq($dst$$XMMRegister, $constantaddress($con));
3165     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3166     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3167   %}
3168   ins_pipe( pipe_slow );
3169 %}
3170 
3171 instruct Repl4L_mem(vecY dst, memory mem) %{
3172   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3173   match(Set dst (ReplicateL (LoadL mem)));
3174   format %{ "movq    $dst,$mem\n\t"
3175             "punpcklqdq $dst,$dst\n\t"
3176             "vinserti128_high $dst,$dst\t! replicate4L" %}
3177   ins_encode %{
3178     __ movq($dst$$XMMRegister, $mem$$Address);
3179     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3180     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3181   %}
3182   ins_pipe( pipe_slow );
3183 %}
3184 
3185 instruct Repl2F_mem(vecD dst, memory mem) %{
3186   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3187   match(Set dst (ReplicateF (LoadF mem)));
3188   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3189   ins_encode %{
3190     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3191   %}
3192   ins_pipe( pipe_slow );
3193 %}
3194 
3195 instruct Repl4F_mem(vecX dst, memory mem) %{
3196   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3197   match(Set dst (ReplicateF (LoadF mem)));
3198   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3199   ins_encode %{
3200     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3201   %}
3202   ins_pipe( pipe_slow );
3203 %}
3204 
3205 instruct Repl8F(vecY dst, regF src) %{
3206   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3207   match(Set dst (ReplicateF src));
3208   format %{ "pshufd  $dst,$src,0x00\n\t"
3209             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3210   ins_encode %{
3211     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3212     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3213   %}
3214   ins_pipe( pipe_slow );
3215 %}
3216 
3217 instruct Repl8F_mem(vecY dst, memory mem) %{
3218   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3219   match(Set dst (ReplicateF (LoadF mem)));
3220   format %{ "pshufd  $dst,$mem,0x00\n\t"
3221             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3222   ins_encode %{
3223     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3224     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3225   %}
3226   ins_pipe( pipe_slow );
3227 %}
3228 
3229 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3230   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3231   match(Set dst (ReplicateF zero));
3232   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3233   ins_encode %{
3234     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3235   %}
3236   ins_pipe( fpu_reg_reg );
3237 %}
3238 
3239 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3240   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3241   match(Set dst (ReplicateF zero));
3242   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3243   ins_encode %{
3244     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3245   %}
3246   ins_pipe( fpu_reg_reg );
3247 %}
3248 
3249 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3250   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3251   match(Set dst (ReplicateF zero));
3252   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3253   ins_encode %{
3254     int vector_len = 1;
3255     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3256   %}
3257   ins_pipe( fpu_reg_reg );
3258 %}
3259 
3260 instruct Repl2D_mem(vecX dst, memory mem) %{
3261   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3262   match(Set dst (ReplicateD (LoadD mem)));
3263   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3264   ins_encode %{
3265     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3266   %}
3267   ins_pipe( pipe_slow );
3268 %}
3269 
3270 instruct Repl4D(vecY dst, regD src) %{
3271   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3272   match(Set dst (ReplicateD src));
3273   format %{ "pshufd  $dst,$src,0x44\n\t"
3274             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3275   ins_encode %{
3276     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3277     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3278   %}
3279   ins_pipe( pipe_slow );
3280 %}
3281 
3282 instruct Repl4D_mem(vecY dst, memory mem) %{
3283   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3284   match(Set dst (ReplicateD (LoadD mem)));
3285   format %{ "pshufd  $dst,$mem,0x44\n\t"
3286             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3287   ins_encode %{
3288     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3289     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3290   %}
3291   ins_pipe( pipe_slow );
3292 %}
3293 
3294 // Replicate double (8 byte) scalar zero to be vector
3295 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3296   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3297   match(Set dst (ReplicateD zero));
3298   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3299   ins_encode %{
3300     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3301   %}
3302   ins_pipe( fpu_reg_reg );
3303 %}
3304 
3305 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3306   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3307   match(Set dst (ReplicateD zero));
3308   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3309   ins_encode %{
3310     int vector_len = 1;
3311     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3312   %}
3313   ins_pipe( fpu_reg_reg );
3314 %}
3315 
3316 // ====================GENERIC REPLICATE==========================================
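     // The patterns in this section are guarded only by vector length (plus the
     // _LP64 split for longs); they carry no CPU-feature predicates.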
3317 
3318 // Replicate byte scalar to be vector
3319 instruct Repl4B(vecS dst, rRegI src) %{
3320   predicate(n->as_Vector()->length() == 4);
3321   match(Set dst (ReplicateB src));
3322   format %{ "movd    $dst,$src\n\t"
3323             "punpcklbw $dst,$dst\n\t"
3324             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3325   ins_encode %{
3326     __ movdl($dst$$XMMRegister, $src$$Register);
3327     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3328     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3329   %}
3330   ins_pipe( pipe_slow );
3331 %}
3332 
3333 instruct Repl8B(vecD dst, rRegI src) %{
3334   predicate(n->as_Vector()->length() == 8);
3335   match(Set dst (ReplicateB src));
3336   format %{ "movd    $dst,$src\n\t"
3337             "punpcklbw $dst,$dst\n\t"
3338             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3339   ins_encode %{
3340     __ movdl($dst$$XMMRegister, $src$$Register);
3341     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3342     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3343   %}
3344   ins_pipe( pipe_slow );
3345 %}
3346 
3347 // Replicate byte scalar immediate to be vector by loading from const table.
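     // The replicate4_imm/replicate8_imm helpers repeat the immediate across a 4- or
     // 8-byte constant at the given element width, so a single movdl/movq from the
     // constant table loads the already-replicated value.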
3348 instruct Repl4B_imm(vecS dst, immI con) %{
3349   predicate(n->as_Vector()->length() == 4);
3350   match(Set dst (ReplicateB con));
3351   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3352   ins_encode %{
3353     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3354   %}
3355   ins_pipe( pipe_slow );
3356 %}
3357 
3358 instruct Repl8B_imm(vecD dst, immI con) %{
3359   predicate(n->as_Vector()->length() == 8);
3360   match(Set dst (ReplicateB con));
3361   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3362   ins_encode %{
3363     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3364   %}
3365   ins_pipe( pipe_slow );
3366 %}
3367 
3368 // Replicate byte scalar zero to be vector
3369 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3370   predicate(n->as_Vector()->length() == 4);
3371   match(Set dst (ReplicateB zero));
3372   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3373   ins_encode %{
3374     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3375   %}
3376   ins_pipe( fpu_reg_reg );
3377 %}
3378 
3379 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3380   predicate(n->as_Vector()->length() == 8);
3381   match(Set dst (ReplicateB zero));
3382   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3383   ins_encode %{
3384     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3385   %}
3386   ins_pipe( fpu_reg_reg );
3387 %}
3388 
3389 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3390   predicate(n->as_Vector()->length() == 16);
3391   match(Set dst (ReplicateB zero));
3392   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3393   ins_encode %{
3394     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3395   %}
3396   ins_pipe( fpu_reg_reg );
3397 %}
3398 
3399 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3400   predicate(n->as_Vector()->length() == 32);
3401   match(Set dst (ReplicateB zero));
3402   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3403   ins_encode %{
3404     // The MacroAssembler vpxor falls back to vxorpd on plain AVX, which lacks 256-bit vpxor (AVX2 adds it).
3405     int vector_len = 1;
3406     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3407   %}
3408   ins_pipe( fpu_reg_reg );
3409 %}
3410 
3411 // Replicate char/short (2 byte) scalar to be vector
3412 instruct Repl2S(vecS dst, rRegI src) %{
3413   predicate(n->as_Vector()->length() == 2);
3414   match(Set dst (ReplicateS src));
3415   format %{ "movd    $dst,$src\n\t"
3416             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3417   ins_encode %{
3418     __ movdl($dst$$XMMRegister, $src$$Register);
3419     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3420   %}
3421   ins_pipe( fpu_reg_reg );
3422 %}
3423 
3424 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3425 instruct Repl2S_imm(vecS dst, immI con) %{
3426   predicate(n->as_Vector()->length() == 2);
3427   match(Set dst (ReplicateS con));
3428   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3429   ins_encode %{
3430     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3431   %}
3432   ins_pipe( fpu_reg_reg );
3433 %}
3434 
3435 instruct Repl4S_imm(vecD dst, immI con) %{
3436   predicate(n->as_Vector()->length() == 4);
3437   match(Set dst (ReplicateS con));
3438   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3439   ins_encode %{
3440     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3441   %}
3442   ins_pipe( fpu_reg_reg );
3443 %}
3444 
3445 // Replicate char/short (2 byte) scalar zero to be vector
3446 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3447   predicate(n->as_Vector()->length() == 2);
3448   match(Set dst (ReplicateS zero));
3449   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3450   ins_encode %{
3451     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3452   %}
3453   ins_pipe( fpu_reg_reg );
3454 %}
3455 
3456 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3457   predicate(n->as_Vector()->length() == 4);
3458   match(Set dst (ReplicateS zero));
3459   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3460   ins_encode %{
3461     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3462   %}
3463   ins_pipe( fpu_reg_reg );
3464 %}
3465 
3466 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3467   predicate(n->as_Vector()->length() == 8);
3468   match(Set dst (ReplicateS zero));
3469   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3470   ins_encode %{
3471     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3472   %}
3473   ins_pipe( fpu_reg_reg );
3474 %}
3475 
3476 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3477   predicate(n->as_Vector()->length() == 16);
3478   match(Set dst (ReplicateS zero));
3479   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3480   ins_encode %{
3481     // The MacroAssembler vpxor falls back to vxorpd on plain AVX, which lacks 256-bit vpxor (AVX2 adds it).
3482     int vector_len = 1;
3483     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3484   %}
3485   ins_pipe( fpu_reg_reg );
3486 %}
3487 
3488 // Replicate integer (4 byte) scalar to be vector
3489 instruct Repl2I(vecD dst, rRegI src) %{
3490   predicate(n->as_Vector()->length() == 2);
3491   match(Set dst (ReplicateI src));
3492   format %{ "movd    $dst,$src\n\t"
3493             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3494   ins_encode %{
3495     __ movdl($dst$$XMMRegister, $src$$Register);
3496     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3497   %}
3498   ins_pipe( fpu_reg_reg );
3499 %}
3500 
3501 // An integer can be loaded into an XMM register directly from memory.
3502 instruct Repl2I_mem(vecD dst, memory mem) %{
3503   predicate(n->as_Vector()->length() == 2);
3504   match(Set dst (ReplicateI (LoadI mem)));
3505   format %{ "movd    $dst,$mem\n\t"
3506             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3507   ins_encode %{
3508     __ movdl($dst$$XMMRegister, $mem$$Address);
3509     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3510   %}
3511   ins_pipe( fpu_reg_reg );
3512 %}
3513 
3514 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3515 instruct Repl2I_imm(vecD dst, immI con) %{
3516   predicate(n->as_Vector()->length() == 2);
3517   match(Set dst (ReplicateI con));
3518   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3519   ins_encode %{
3520     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3521   %}
3522   ins_pipe( fpu_reg_reg );
3523 %}
3524 
3525 // Replicate integer (4 byte) scalar zero to be vector
3526 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3527   predicate(n->as_Vector()->length() == 2);
3528   match(Set dst (ReplicateI zero));
3529   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3530   ins_encode %{
3531     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3532   %}
3533   ins_pipe( fpu_reg_reg );
3534 %}
3535 
3536 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3537   predicate(n->as_Vector()->length() == 4);
3538   match(Set dst (ReplicateI zero));
3539   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
3540   ins_encode %{
3541     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3542   %}
3543   ins_pipe( fpu_reg_reg );
3544 %}
3545 
3546 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3547   predicate(n->as_Vector()->length() == 8);
3548   match(Set dst (ReplicateI zero));
3549   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3550   ins_encode %{
3551     // The MacroAssembler vpxor falls back to vxorpd on plain AVX, which lacks 256-bit vpxor (AVX2 adds it).
3552     int vector_len = 1;
3553     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3554   %}
3555   ins_pipe( fpu_reg_reg );
3556 %}
3557 
3558 // Replicate long (8 byte) scalar to be vector
3559 #ifdef _LP64
3560 instruct Repl2L(vecX dst, rRegL src) %{
3561   predicate(n->as_Vector()->length() == 2);
3562   match(Set dst (ReplicateL src));
3563   format %{ "movdq   $dst,$src\n\t"
3564             "punpcklqdq $dst,$dst\t! replicate2L" %}
3565   ins_encode %{
3566     __ movdq($dst$$XMMRegister, $src$$Register);
3567     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3568   %}
3569   ins_pipe( pipe_slow );
3570 %}
3571 #else // _LP64
3572 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3573   predicate(n->as_Vector()->length() == 2);
3574   match(Set dst (ReplicateL src));
3575   effect(TEMP dst, USE src, TEMP tmp);
3576   format %{ "movdl   $dst,$src.lo\n\t"
3577             "movdl   $tmp,$src.hi\n\t"
3578             "punpckldq $dst,$tmp\n\t"
3579             "punpcklqdq $dst,$dst\t! replicate2L"%}
3580   ins_encode %{
3581     __ movdl($dst$$XMMRegister, $src$$Register);
3582     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3583     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3584     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3585   %}
3586   ins_pipe( pipe_slow );
3587 %}
3588 #endif // _LP64
3589 
3590 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3591 instruct Repl2L_imm(vecX dst, immL con) %{
3592   predicate(n->as_Vector()->length() == 2);
3593   match(Set dst (ReplicateL con));
3594   format %{ "movq    $dst,[$constantaddress]\n\t"
3595             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3596   ins_encode %{
3597     __ movq($dst$$XMMRegister, $constantaddress($con));
3598     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3599   %}
3600   ins_pipe( pipe_slow );
3601 %}
3602 
3603 // Replicate long (8 byte) scalar zero to be vector
3604 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3605   predicate(n->as_Vector()->length() == 2);
3606   match(Set dst (ReplicateL zero));
3607   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3608   ins_encode %{
3609     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3610   %}
3611   ins_pipe( fpu_reg_reg );
3612 %}
3613 
3614 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3615   predicate(n->as_Vector()->length() == 4);
3616   match(Set dst (ReplicateL zero));
3617   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3618   ins_encode %{
3619     // The MacroAssembler vpxor falls back to vxorpd on plain AVX, which lacks 256-bit vpxor (AVX2 adds it).
3620     int vector_len = 1;
3621     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3622   %}
3623   ins_pipe( fpu_reg_reg );
3624 %}
3625 
3626 // Replicate float (4 byte) scalar to be vector
3627 instruct Repl2F(vecD dst, regF src) %{
3628   predicate(n->as_Vector()->length() == 2);
3629   match(Set dst (ReplicateF src));
3630   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
3631   ins_encode %{
3632     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3633   %}
3634   ins_pipe( fpu_reg_reg );
3635 %}
3636 
3637 instruct Repl4F(vecX dst, regF src) %{
3638   predicate(n->as_Vector()->length() == 4);
3639   match(Set dst (ReplicateF src));
3640   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
3641   ins_encode %{
3642     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3643   %}
3644   ins_pipe( pipe_slow );
3645 %}
3646 
3647 // Replicate double (8 byte) scalar to be vector
3648 instruct Repl2D(vecX dst, regD src) %{
3649   predicate(n->as_Vector()->length() == 2);
3650   match(Set dst (ReplicateD src));
3651   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3652   ins_encode %{
3653     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3654   %}
3655   ins_pipe( pipe_slow );
3656 %}
3657 
3658 // ====================EVEX REPLICATE=============================================
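     // The patterns in this section use the EVEX-encoded vpbroadcastb/w/d forms, which
     // broadcast directly from a general-purpose register or a memory operand. The
     // predicates require avx512vlbw for the 128/256-bit byte and short cases,
     // avx512bw for the 512-bit byte and short cases, and avx512vl for the
     // 128/256-bit int cases.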
3659 
3660 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
3661   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3662   match(Set dst (ReplicateB (LoadB mem)));
3663   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
3664   ins_encode %{
3665     int vector_len = 0;
3666     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3667   %}
3668   ins_pipe( pipe_slow );
3669 %}
3670 
3671 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
3672   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3673   match(Set dst (ReplicateB (LoadB mem)));
3674   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
3675   ins_encode %{
3676     int vector_len = 0;
3677     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3678   %}
3679   ins_pipe( pipe_slow );
3680 %}
3681 
3682 instruct Repl16B_evex(vecX dst, rRegI src) %{
3683   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3684   match(Set dst (ReplicateB src));
3685   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
3686   ins_encode %{
3687     int vector_len = 0;
3688     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3689   %}
3690   ins_pipe( pipe_slow );
3691 %}
3692 
3693 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
3694   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3695   match(Set dst (ReplicateB (LoadB mem)));
3696   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
3697   ins_encode %{
3698     int vector_len = 0;
3699     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3700   %}
3701   ins_pipe( pipe_slow );
3702 %}
3703 
3704 instruct Repl32B_evex(vecY dst, rRegI src) %{
3705   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3706   match(Set dst (ReplicateB src));
3707   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
3708   ins_encode %{
3709     int vector_len = 1;
3710     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3711   %}
3712   ins_pipe( pipe_slow );
3713 %}
3714 
3715 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
3716   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3717   match(Set dst (ReplicateB (LoadB mem)));
3718   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
3719   ins_encode %{
3720     int vector_len = 1;
3721     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3722   %}
3723   ins_pipe( pipe_slow );
3724 %}
3725 
3726 instruct Repl64B_evex(vecZ dst, rRegI src) %{
3727   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3728   match(Set dst (ReplicateB src));
3729   format %{ "vpbroadcastb $dst,$src\t! replicate64B" %}
3730   ins_encode %{
3731     int vector_len = 2;
3732     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3733   %}
3734   ins_pipe( pipe_slow );
3735 %}
3736 
3737 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
3738   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3739   match(Set dst (ReplicateB (LoadB mem)));
3740   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
3741   ins_encode %{
3742     int vector_len = 2;
3743     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3744   %}
3745   ins_pipe( pipe_slow );
3746 %}
3747 
3748 instruct Repl16B_imm_evex(vecX dst, immI con) %{
3749   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3750   match(Set dst (ReplicateB con));
3751   format %{ "movq    $dst,[$constantaddress]\n\t"
3752             "vpbroadcastb $dst,$dst\t! replicate16B" %}
3753   ins_encode %{
3754     int vector_len = 0;
3755     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3756     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3757   %}
3758   ins_pipe( pipe_slow );
3759 %}
3760 
3761 instruct Repl32B_imm_evex(vecY dst, immI con) %{
3762   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3763   match(Set dst (ReplicateB con));
3764   format %{ "movq    $dst,[$constantaddress]\n\t"
3765             "vpbroadcastb $dst,$dst\t! replicate32B" %}
3766   ins_encode %{
3767     int vector_len = 1;
3768     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3769     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3770   %}
3771   ins_pipe( pipe_slow );
3772 %}
3773 
3774 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
3775   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3776   match(Set dst (ReplicateB con));
3777   format %{ "movq    $dst,[$constantaddress]\n\t"
3778             "vpbroadcastb $dst,$dst\t! replicate64B" %}
3779   ins_encode %{
3780     int vector_len = 2;
3781     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3782     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3783   %}
3784   ins_pipe( pipe_slow );
3785 %}
3786 
3787 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
3788   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3789   match(Set dst (ReplicateB zero));
3790   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3791   ins_encode %{
3792     // The MacroAssembler vpxor emits the EVEX-encoded 512-bit vpxor here (UseAVX > 2).
3793     int vector_len = 2;
3794     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3795   %}
3796   ins_pipe( fpu_reg_reg );
3797 %}
3798 
3799 instruct Repl4S_evex(vecD dst, rRegI src) %{
3800   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3801   match(Set dst (ReplicateS src));
3802   format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
3803   ins_encode %{
3804     int vector_len = 0;
3805     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3806   %}
3807   ins_pipe( pipe_slow );
3808 %}
3809 
3810 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
3811   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3812   match(Set dst (ReplicateS (LoadS mem)));
3813   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
3814   ins_encode %{
3815     int vector_len = 0;
3816     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3817   %}
3818   ins_pipe( pipe_slow );
3819 %}
3820 
3821 instruct Repl8S_evex(vecX dst, rRegI src) %{
3822   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3823   match(Set dst (ReplicateS src));
3824   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
3825   ins_encode %{
3826     int vector_len = 0;
3827     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3828   %}
3829   ins_pipe( pipe_slow );
3830 %}
3831 
3832 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
3833   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3834   match(Set dst (ReplicateS (LoadS mem)));
3835   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
3836   ins_encode %{
3837     int vector_len = 0;
3838     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3839   %}
3840   ins_pipe( pipe_slow );
3841 %}
3842 
3843 instruct Repl16S_evex(vecY dst, rRegI src) %{
3844   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3845   match(Set dst (ReplicateS src));
3846   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
3847   ins_encode %{
3848     int vector_len = 1;
3849     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3850   %}
3851   ins_pipe( pipe_slow );
3852 %}
3853 
3854 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
3855   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3856   match(Set dst (ReplicateS (LoadS mem)));
3857   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
3858   ins_encode %{
3859     int vector_len = 1;
3860     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3861   %}
3862   ins_pipe( pipe_slow );
3863 %}
3864 
3865 instruct Repl32S_evex(vecZ dst, rRegI src) %{
3866   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3867   match(Set dst (ReplicateS src));
3868   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
3869   ins_encode %{
3870     int vector_len = 2;
3871     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3872   %}
3873   ins_pipe( pipe_slow );
3874 %}
3875 
3876 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
3877   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3878   match(Set dst (ReplicateS (LoadS mem)));
3879   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
3880   ins_encode %{
3881     int vector_len = 2;
3882     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3883   %}
3884   ins_pipe( pipe_slow );
3885 %}
3886 
3887 instruct Repl8S_imm_evex(vecX dst, immI con) %{
3888   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3889   match(Set dst (ReplicateS con));
3890   format %{ "movq    $dst,[$constantaddress]\n\t"
3891             "vpbroadcastw $dst,$dst\t! replicate8S" %}
3892   ins_encode %{
3893     int vector_len = 0;
3894     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3895     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3896   %}
3897   ins_pipe( pipe_slow );
3898 %}
3899 
3900 instruct Repl16S_imm_evex(vecY dst, immI con) %{
3901   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3902   match(Set dst (ReplicateS con));
3903   format %{ "movq    $dst,[$constantaddress]\n\t"
3904             "vpbroadcastw $dst,$dst\t! replicate16S" %}
3905   ins_encode %{
3906     int vector_len = 1;
3907     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3908     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3909   %}
3910   ins_pipe( pipe_slow );
3911 %}
3912 
3913 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
3914   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3915   match(Set dst (ReplicateS con));
3916   format %{ "movq    $dst,[$constantaddress]\n\t"
3917             "vpbroadcastw $dst,$dst\t! replicate32S" %}
3918   ins_encode %{
3919     int vector_len = 2;
3920     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3921     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3922   %}
3923   ins_pipe( pipe_slow );
3924 %}
3925 
3926 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
3927   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
3928   match(Set dst (ReplicateS zero));
3929   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
3930   ins_encode %{
3931     // Use vpxor since plain AVX does not have a 512-bit vpxor (EVEX provides it).
3932     int vector_len = 2;
3933     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3934   %}
3935   ins_pipe( fpu_reg_reg );
3936 %}
3937 
3938 instruct Repl4I_evex(vecX dst, rRegI src) %{
3939   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3940   match(Set dst (ReplicateI src));
3941   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
3942   ins_encode %{
3943     int vector_len = 0;
3944     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3945   %}
3946   ins_pipe( pipe_slow );
3947 %}
3948 
3949 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
3950   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3951   match(Set dst (ReplicateI (LoadI mem)));
3952   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
3953   ins_encode %{
3954     int vector_len = 0;
3955     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
3956   %}
3957   ins_pipe( pipe_slow );
3958 %}
3959 
3960 instruct Repl8I_evex(vecY dst, rRegI src) %{
3961   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
3962   match(Set dst (ReplicateI src));
3963   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
3964   ins_encode %{
3965     int vector_len = 1;
3966     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3967   %}
3968   ins_pipe( pipe_slow );
3969 %}
3970 
3971 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
3972   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
3973   match(Set dst (ReplicateI (LoadI mem)));
3974   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
3975   ins_encode %{
3976     int vector_len = 1;
3977     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
3978   %}
3979   ins_pipe( pipe_slow );
3980 %}
3981 
3982 instruct Repl16I_evex(vecZ dst, rRegI src) %{
3983   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
3984   match(Set dst (ReplicateI src));
3985   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
3986   ins_encode %{
3987     int vector_len = 2;
3988     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3989   %}
3990   ins_pipe( pipe_slow );
3991 %}
3992 
3993 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
3994   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
3995   match(Set dst (ReplicateI (LoadI mem)));
3996   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
3997   ins_encode %{
3998     int vector_len = 2;
3999     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4000   %}
4001   ins_pipe( pipe_slow );
4002 %}
4003 
4004 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4005   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4006   match(Set dst (ReplicateI con));
4007   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4008             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4009   ins_encode %{
4010     int vector_len = 0;
4011     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4012     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4013   %}
4014   ins_pipe( pipe_slow );
4015 %}
4016 
4017 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4018   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4019   match(Set dst (ReplicateI con));
4020   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4021             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4022   ins_encode %{
4023     int vector_len = 1;
4024     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4025     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4026   %}
4027   ins_pipe( pipe_slow );
4028 %}
4029 
4030 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4031   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4032   match(Set dst (ReplicateI con));
4033   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4034             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4035   ins_encode %{
4036     int vector_len = 2;
4037     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4038     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4039   %}
4040   ins_pipe( pipe_slow );
4041 %}
4042 
4043 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4044   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4045   match(Set dst (ReplicateI zero));
4046   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4047   ins_encode %{
4048     // Use vpxor since plain AVX does not have a 512-bit vpxor (EVEX provides it).
4049     int vector_len = 2;
4050     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4051   %}
4052   ins_pipe( fpu_reg_reg );
4053 %}
4054 
4055 // Replicate long (8 byte) scalar to be vector
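// On LP64 the long value is already in a single general register and can be
// broadcast directly; on 32-bit VMs it lives in a register pair, so the lo/hi
// halves are first assembled into an XMM register with movdl/punpckldq before
// the broadcast.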
4056 #ifdef _LP64
4057 instruct Repl4L_evex(vecY dst, rRegL src) %{
4058   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4059   match(Set dst (ReplicateL src));
4060   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4061   ins_encode %{
4062     int vector_len = 1;
4063     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4064   %}
4065   ins_pipe( pipe_slow );
4066 %}
4067 
4068 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4069   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4070   match(Set dst (ReplicateL src));
4071   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4072   ins_encode %{
4073     int vector_len = 2;
4074     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4075   %}
4076   ins_pipe( pipe_slow );
4077 %}
4078 #else // _LP64
4079 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4080   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4081   match(Set dst (ReplicateL src));
4082   effect(TEMP dst, USE src, TEMP tmp);
4083   format %{ "movdl   $dst,$src.lo\n\t"
4084             "movdl   $tmp,$src.hi\n\t"
4085             "punpckldq $dst,$tmp\n\t"
4086             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4087   ins_encode %{
4088     int vector_len = 1;
4089     __ movdl($dst$$XMMRegister, $src$$Register);
4090     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4091     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4092     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4093   %}
4094   ins_pipe( pipe_slow );
4095 %}
4096 
4097 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4098   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4099   match(Set dst (ReplicateL src));
4100   effect(TEMP dst, USE src, TEMP tmp);
4101   format %{ "movdl   $dst,$src.lo\n\t"
4102             "movdl   $tmp,$src.hi\n\t"
4103             "punpckldq $dst,$tmp\n\t"
4104             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4105   ins_encode %{
4106     int vector_len = 2;
4107     __ movdl($dst$$XMMRegister, $src$$Register);
4108     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4109     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4110     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4111   %}
4112   ins_pipe( pipe_slow );
4113 %}
4114 #endif // _LP64
4115 
4116 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4117   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4118   match(Set dst (ReplicateL con));
4119   format %{ "movq    $dst,[$constantaddress]\n\t"
4120             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4121   ins_encode %{
4122     int vector_len = 1;
4123     __ movq($dst$$XMMRegister, $constantaddress($con));
4124     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4125   %}
4126   ins_pipe( pipe_slow );
4127 %}
4128 
4129 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4130   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4131   match(Set dst (ReplicateL con));
4132   format %{ "movq    $dst,[$constantaddress]\n\t"
4133             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4134   ins_encode %{
4135     int vector_len = 2;
4136     __ movq($dst$$XMMRegister, $constantaddress($con));
4137     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4138   %}
4139   ins_pipe( pipe_slow );
4140 %}
4141 
4142 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4143   predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
4144   match(Set dst (ReplicateL (LoadL mem)));
4145   format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
4146   ins_encode %{
4147     int vector_len = 0;
4148     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4149   %}
4150   ins_pipe( pipe_slow );
4151 %}
4152 
4153 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4154   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4155   match(Set dst (ReplicateL (LoadL mem)));
4156   format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
4157   ins_encode %{
4158     int vector_len = 1;
4159     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4160   %}
4161   ins_pipe( pipe_slow );
4162 %}
4163 
4164 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4165   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4166   match(Set dst (ReplicateL (LoadL mem)));
4167   format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
4168   ins_encode %{
4169     int vector_len = 2;
4170     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4171   %}
4172   ins_pipe( pipe_slow );
4173 %}
4174 
4175 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4176   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4177   match(Set dst (ReplicateL zero));
4178   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4179   ins_encode %{
4180     // Use vpxor since plain AVX does not have a 512-bit vpxor (EVEX provides it).
4181     int vector_len = 2;
4182     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4183   %}
4184   ins_pipe( fpu_reg_reg );
4185 %}
4186 
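// Replicate float (4 byte) scalar to be vector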
4187 instruct Repl8F_evex(vecY dst, regF src) %{
4188   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4189   match(Set dst (ReplicateF src));
4190   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4191   ins_encode %{
4192     int vector_len = 1;
4193     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4194   %}
4195   ins_pipe( pipe_slow );
4196 %}
4197 
4198 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4199   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4200   match(Set dst (ReplicateF (LoadF mem)));
4201   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4202   ins_encode %{
4203     int vector_len = 1;
4204     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4205   %}
4206   ins_pipe( pipe_slow );
4207 %}
4208 
4209 instruct Repl16F_evex(vecZ dst, regF src) %{
4210   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4211   match(Set dst (ReplicateF src));
4212   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4213   ins_encode %{
4214     int vector_len = 2;
4215     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4216   %}
4217   ins_pipe( pipe_slow );
4218 %}
4219 
4220 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4221   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4222   match(Set dst (ReplicateF (LoadF mem)));
4223   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4224   ins_encode %{
4225     int vector_len = 2;
4226     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4227   %}
4228   ins_pipe( pipe_slow );
4229 %}
4230 
4231 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4232   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4233   match(Set dst (ReplicateF zero));
4234   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4235   ins_encode %{
4236     // Use vpxor in place of vxorps since EVEX-encoded vxorps requires AVX512DQ: this is a 512-bit operation
4237     int vector_len = 2;
4238     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4239   %}
4240   ins_pipe( fpu_reg_reg );
4241 %}
4242 
4243 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4244   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4245   match(Set dst (ReplicateF zero));
4246   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4247   ins_encode %{
4248     // Use vpxor in place of vxorps since EVEX-encoded vxorps requires AVX512DQ: this is a 512-bit operation
4249     int vector_len = 2;
4250     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4251   %}
4252   ins_pipe( fpu_reg_reg );
4253 %}
4254 
4255 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4256   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4257   match(Set dst (ReplicateF zero));
4258   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4259   ins_encode %{
4260     // Use vpxor in place of vxorps since EVEX-encoded vxorps requires AVX512DQ: this is a 512-bit operation
4261     int vector_len = 2;
4262     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4263   %}
4264   ins_pipe( fpu_reg_reg );
4265 %}
4266 
4267 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4268   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4269   match(Set dst (ReplicateF zero));
4270   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4271   ins_encode %{
4272     // Use vpxor in place of vxorps since EVEX-encoded vxorps requires AVX512DQ: this is a 512-bit operation
4273     int vector_len = 2;
4274     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4275   %}
4276   ins_pipe( fpu_reg_reg );
4277 %}
4278 
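// Replicate double (8 byte) scalar to be vector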
4279 instruct Repl4D_evex(vecY dst, regD src) %{
4280   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4281   match(Set dst (ReplicateD src));
4282   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4283   ins_encode %{
4284     int vector_len = 1;
4285     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4286   %}
4287   ins_pipe( pipe_slow );
4288 %}
4289 
4290 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4291   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4292   match(Set dst (ReplicateD (LoadD mem)));
4293   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4294   ins_encode %{
4295     int vector_len = 1;
4296     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4297   %}
4298   ins_pipe( pipe_slow );
4299 %}
4300 
4301 instruct Repl8D_evex(vecZ dst, regD src) %{
4302   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4303   match(Set dst (ReplicateD src));
4304   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4305   ins_encode %{
4306     int vector_len = 2;
4307     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4308   %}
4309   ins_pipe( pipe_slow );
4310 %}
4311 
4312 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4313   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4314   match(Set dst (ReplicateD (LoadD mem)));
4315   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4316   ins_encode %{
4317     int vector_len = 2;
4318     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4319   %}
4320   ins_pipe( pipe_slow );
4321 %}
4322 
4323 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4324   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4325   match(Set dst (ReplicateD zero));
4326   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4327   ins_encode %{
4328     // Use vpxor in place of vxorpd since EVEX-encoded vxorpd requires AVX512DQ: this is a 512-bit operation
4329     int vector_len = 2;
4330     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4331   %}
4332   ins_pipe( fpu_reg_reg );
4333 %}
4334 
4335 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4336   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4337   match(Set dst (ReplicateD zero));
4338   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4339   ins_encode %{
4340     // Use vpxor in place of vxorpd since EVEX-encoded vxorpd requires AVX512DQ: this is a 512-bit operation
4341     int vector_len = 2;
4342     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4343   %}
4344   ins_pipe( fpu_reg_reg );
4345 %}
4346 
4347 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4348   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4349   match(Set dst (ReplicateD zero));
4350   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8D zero" %}
4351   ins_encode %{
4352     // Use vpxor in place of vxorpd since EVEX-encoded vxorpd requires AVX512DQ: this is a 512-bit operation
4353     int vector_len = 2;
4354     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4355   %}
4356   ins_pipe( fpu_reg_reg );
4357 %}
4358 
4359 // ====================REDUCTION ARITHMETIC=======================================
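// The reduction rules below fold every lane of the vector operand (src2) into a
// single scalar and combine it with the incoming accumulator (src1 for the
// integer and long rules, dst for the float and double rules).  The integer
// variants halve the vector with extracts/shuffles and pairwise adds or
// multiplies before moving the result to a general register; the FP variants
// accumulate one lane at a time into dst, keeping the operations in sequential
// lane order (FP arithmetic is not associative).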
4360 
4361 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4362   predicate(UseSSE > 2 && UseAVX == 0);
4363   match(Set dst (AddReductionVI src1 src2));
4364   effect(TEMP tmp2, TEMP tmp);
4365   format %{ "movdqu  $tmp2,$src2\n\t"
4366             "phaddd  $tmp2,$tmp2\n\t"
4367             "movd    $tmp,$src1\n\t"
4368             "paddd   $tmp,$tmp2\n\t"
4369             "movd    $dst,$tmp\t! add reduction2I" %}
4370   ins_encode %{
4371     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4372     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4373     __ movdl($tmp$$XMMRegister, $src1$$Register);
4374     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4375     __ movdl($dst$$Register, $tmp$$XMMRegister);
4376   %}
4377   ins_pipe( pipe_slow );
4378 %}
4379 
4380 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4381   predicate(VM_Version::supports_avxonly());
4382   match(Set dst (AddReductionVI src1 src2));
4383   effect(TEMP tmp, TEMP tmp2);
4384   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4385             "movd     $tmp2,$src1\n\t"
4386             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4387             "movd     $dst,$tmp2\t! add reduction2I" %}
4388   ins_encode %{
4389     int vector_len = 0;
4390     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4391     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4392     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4393     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4394   %}
4395   ins_pipe( pipe_slow );
4396 %}
4397 
4398 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4399   predicate(UseAVX > 2);
4400   match(Set dst (AddReductionVI src1 src2));
4401   effect(TEMP tmp, TEMP tmp2);
4402   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4403             "vpaddd  $tmp,$src2,$tmp2\n\t"
4404             "movd    $tmp2,$src1\n\t"
4405             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4406             "movd    $dst,$tmp2\t! add reduction2I" %}
4407   ins_encode %{
4408     int vector_len = 0;
4409     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4410     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4411     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4412     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4413     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4414   %}
4415   ins_pipe( pipe_slow );
4416 %}
4417 
4418 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4419   predicate(UseSSE > 2 && UseAVX == 0);
4420   match(Set dst (AddReductionVI src1 src2));
4421   effect(TEMP tmp, TEMP tmp2);
4422   format %{ "movdqu  $tmp,$src2\n\t"
4423             "phaddd  $tmp,$tmp\n\t"
4424             "phaddd  $tmp,$tmp\n\t"
4425             "movd    $tmp2,$src1\n\t"
4426             "paddd   $tmp2,$tmp\n\t"
4427             "movd    $dst,$tmp2\t! add reduction4I" %}
4428   ins_encode %{
4429     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4430     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4431     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4432     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4433     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4434     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4435   %}
4436   ins_pipe( pipe_slow );
4437 %}
4438 
4439 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4440   predicate(VM_Version::supports_avxonly());
4441   match(Set dst (AddReductionVI src1 src2));
4442   effect(TEMP tmp, TEMP tmp2);
4443   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4444             "vphaddd  $tmp,$tmp,$tmp\n\t"
4445             "movd     $tmp2,$src1\n\t"
4446             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4447             "movd     $dst,$tmp2\t! add reduction4I" %}
4448   ins_encode %{
4449     int vector_len = 0;
4450     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4451     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4452     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4453     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4454     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4455   %}
4456   ins_pipe( pipe_slow );
4457 %}
4458 
4459 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4460   predicate(UseAVX > 2);
4461   match(Set dst (AddReductionVI src1 src2));
4462   effect(TEMP tmp, TEMP tmp2);
4463   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4464             "vpaddd  $tmp,$src2,$tmp2\n\t"
4465             "pshufd  $tmp2,$tmp,0x1\n\t"
4466             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4467             "movd    $tmp2,$src1\n\t"
4468             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4469             "movd    $dst,$tmp2\t! add reduction4I" %}
4470   ins_encode %{
4471     int vector_len = 0;
4472     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4473     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4474     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4475     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4476     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4477     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4478     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4479   %}
4480   ins_pipe( pipe_slow );
4481 %}
4482 
4483 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4484   predicate(VM_Version::supports_avxonly());
4485   match(Set dst (AddReductionVI src1 src2));
4486   effect(TEMP tmp, TEMP tmp2);
4487   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4488             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4489             "vextracti128_high  $tmp2,$tmp\n\t"
4490             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4491             "movd     $tmp2,$src1\n\t"
4492             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4493             "movd     $dst,$tmp2\t! add reduction8I" %}
4494   ins_encode %{
4495     int vector_len = 1;
4496     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4497     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4498     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4499     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4500     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4501     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4502     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4503   %}
4504   ins_pipe( pipe_slow );
4505 %}
4506 
4507 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4508   predicate(UseAVX > 2);
4509   match(Set dst (AddReductionVI src1 src2));
4510   effect(TEMP tmp, TEMP tmp2);
4511   format %{ "vextracti128_high  $tmp,$src2\n\t"
4512             "vpaddd  $tmp,$tmp,$src2\n\t"
4513             "pshufd  $tmp2,$tmp,0xE\n\t"
4514             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4515             "pshufd  $tmp2,$tmp,0x1\n\t"
4516             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4517             "movd    $tmp2,$src1\n\t"
4518             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4519             "movd    $dst,$tmp2\t! add reduction8I" %}
4520   ins_encode %{
4521     int vector_len = 0;
4522     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4523     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4524     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4525     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4526     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4527     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4528     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4529     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4530     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4531   %}
4532   ins_pipe( pipe_slow );
4533 %}
4534 
4535 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4536   predicate(UseAVX > 2);
4537   match(Set dst (AddReductionVI src1 src2));
4538   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4539   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4540             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4541             "vextracti128_high  $tmp,$tmp3\n\t"
4542             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4543             "pshufd  $tmp2,$tmp,0xE\n\t"
4544             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4545             "pshufd  $tmp2,$tmp,0x1\n\t"
4546             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4547             "movd    $tmp2,$src1\n\t"
4548             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4549             "movd    $dst,$tmp2\t! add reduction16I" %}
4550   ins_encode %{
4551     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4552     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4553     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4554     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4555     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4556     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4557     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4558     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4559     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4560     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4561     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4562   %}
4563   ins_pipe( pipe_slow );
4564 %}
4565 
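// Long reductions produce their result in a 64-bit general register (rRegL),
// so they are provided for LP64 only.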
4566 #ifdef _LP64
4567 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4568   predicate(UseAVX > 2);
4569   match(Set dst (AddReductionVL src1 src2));
4570   effect(TEMP tmp, TEMP tmp2);
4571   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4572             "vpaddq  $tmp,$src2,$tmp2\n\t"
4573             "movdq   $tmp2,$src1\n\t"
4574             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4575             "movdq   $dst,$tmp2\t! add reduction2L" %}
4576   ins_encode %{
4577     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4578     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4579     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4580     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4581     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4582   %}
4583   ins_pipe( pipe_slow );
4584 %}
4585 
4586 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4587   predicate(UseAVX > 2);
4588   match(Set dst (AddReductionVL src1 src2));
4589   effect(TEMP tmp, TEMP tmp2);
4590   format %{ "vextracti128_high  $tmp,$src2\n\t"
4591             "vpaddq  $tmp2,$tmp,$src2\n\t"
4592             "pshufd  $tmp,$tmp2,0xE\n\t"
4593             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4594             "movdq   $tmp,$src1\n\t"
4595             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4596             "movdq   $dst,$tmp2\t! add reduction4L" %}
4597   ins_encode %{
4598     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4599     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4600     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4601     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4602     __ movdq($tmp$$XMMRegister, $src1$$Register);
4603     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4604     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4605   %}
4606   ins_pipe( pipe_slow );
4607 %}
4608 
4609 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4610   predicate(UseAVX > 2);
4611   match(Set dst (AddReductionVL src1 src2));
4612   effect(TEMP tmp, TEMP tmp2);
4613   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
4614             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4615             "vextracti128_high  $tmp,$tmp2\n\t"
4616             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4617             "pshufd  $tmp,$tmp2,0xE\n\t"
4618             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4619             "movdq   $tmp,$src1\n\t"
4620             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4621             "movdq   $dst,$tmp2\t! add reduction8L" %}
4622   ins_encode %{
4623     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4624     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4625     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4626     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4627     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4628     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4629     __ movdq($tmp$$XMMRegister, $src1$$Register);
4630     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4631     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4632   %}
4633   ins_pipe( pipe_slow );
4634 %}
4635 #endif
4636 
4637 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4638   predicate(UseSSE >= 1 && UseAVX == 0);
4639   match(Set dst (AddReductionVF dst src2));
4640   effect(TEMP dst, TEMP tmp);
4641   format %{ "addss   $dst,$src2\n\t"
4642             "pshufd  $tmp,$src2,0x01\n\t"
4643             "addss   $dst,$tmp\t! add reduction2F" %}
4644   ins_encode %{
4645     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4646     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4647     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4648   %}
4649   ins_pipe( pipe_slow );
4650 %}
4651 
4652 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4653   predicate(UseAVX > 0);
4654   match(Set dst (AddReductionVF dst src2));
4655   effect(TEMP dst, TEMP tmp);
4656   format %{ "vaddss  $dst,$dst,$src2\n\t"
4657             "pshufd  $tmp,$src2,0x01\n\t"
4658             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4659   ins_encode %{
4660     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4661     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4662     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4663   %}
4664   ins_pipe( pipe_slow );
4665 %}
4666 
4667 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4668   predicate(UseSSE >= 1 && UseAVX == 0);
4669   match(Set dst (AddReductionVF dst src2));
4670   effect(TEMP dst, TEMP tmp);
4671   format %{ "addss   $dst,$src2\n\t"
4672             "pshufd  $tmp,$src2,0x01\n\t"
4673             "addss   $dst,$tmp\n\t"
4674             "pshufd  $tmp,$src2,0x02\n\t"
4675             "addss   $dst,$tmp\n\t"
4676             "pshufd  $tmp,$src2,0x03\n\t"
4677             "addss   $dst,$tmp\t! add reduction4F" %}
4678   ins_encode %{
4679     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4680     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4681     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4682     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4683     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4684     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4685     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4686   %}
4687   ins_pipe( pipe_slow );
4688 %}
4689 
4690 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4691   predicate(UseAVX > 0);
4692   match(Set dst (AddReductionVF dst src2));
4693   effect(TEMP tmp, TEMP dst);
4694   format %{ "vaddss  $dst,$dst,$src2\n\t"
4695             "pshufd  $tmp,$src2,0x01\n\t"
4696             "vaddss  $dst,$dst,$tmp\n\t"
4697             "pshufd  $tmp,$src2,0x02\n\t"
4698             "vaddss  $dst,$dst,$tmp\n\t"
4699             "pshufd  $tmp,$src2,0x03\n\t"
4700             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
4701   ins_encode %{
4702     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4703     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4704     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4705     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4706     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4707     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4708     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4709   %}
4710   ins_pipe( pipe_slow );
4711 %}
4712 
4713 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
4714   predicate(UseAVX > 0);
4715   match(Set dst (AddReductionVF dst src2));
4716   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4717   format %{ "vaddss  $dst,$dst,$src2\n\t"
4718             "pshufd  $tmp,$src2,0x01\n\t"
4719             "vaddss  $dst,$dst,$tmp\n\t"
4720             "pshufd  $tmp,$src2,0x02\n\t"
4721             "vaddss  $dst,$dst,$tmp\n\t"
4722             "pshufd  $tmp,$src2,0x03\n\t"
4723             "vaddss  $dst,$dst,$tmp\n\t"
4724             "vextractf128_high  $tmp2,$src2\n\t"
4725             "vaddss  $dst,$dst,$tmp2\n\t"
4726             "pshufd  $tmp,$tmp2,0x01\n\t"
4727             "vaddss  $dst,$dst,$tmp\n\t"
4728             "pshufd  $tmp,$tmp2,0x02\n\t"
4729             "vaddss  $dst,$dst,$tmp\n\t"
4730             "pshufd  $tmp,$tmp2,0x03\n\t"
4731             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
4732   ins_encode %{
4733     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4734     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4735     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4736     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4737     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4738     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4739     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4740     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4741     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4742     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4743     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4744     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4745     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4746     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4747     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4748   %}
4749   ins_pipe( pipe_slow );
4750 %}
4751 
4752 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
4753   predicate(UseAVX > 2);
4754   match(Set dst (AddReductionVF dst src2));
4755   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4756   format %{ "vaddss  $dst,$dst,$src2\n\t"
4757             "pshufd  $tmp,$src2,0x01\n\t"
4758             "vaddss  $dst,$dst,$tmp\n\t"
4759             "pshufd  $tmp,$src2,0x02\n\t"
4760             "vaddss  $dst,$dst,$tmp\n\t"
4761             "pshufd  $tmp,$src2,0x03\n\t"
4762             "vaddss  $dst,$dst,$tmp\n\t"
4763             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4764             "vaddss  $dst,$dst,$tmp2\n\t"
4765             "pshufd  $tmp,$tmp2,0x01\n\t"
4766             "vaddss  $dst,$dst,$tmp\n\t"
4767             "pshufd  $tmp,$tmp2,0x02\n\t"
4768             "vaddss  $dst,$dst,$tmp\n\t"
4769             "pshufd  $tmp,$tmp2,0x03\n\t"
4770             "vaddss  $dst,$dst,$tmp\n\t"
4771             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4772             "vaddss  $dst,$dst,$tmp2\n\t"
4773             "pshufd  $tmp,$tmp2,0x01\n\t"
4774             "vaddss  $dst,$dst,$tmp\n\t"
4775             "pshufd  $tmp,$tmp2,0x02\n\t"
4776             "vaddss  $dst,$dst,$tmp\n\t"
4777             "pshufd  $tmp,$tmp2,0x03\n\t"
4778             "vaddss  $dst,$dst,$tmp\n\t"
4779             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4780             "vaddss  $dst,$dst,$tmp2\n\t"
4781             "pshufd  $tmp,$tmp2,0x01\n\t"
4782             "vaddss  $dst,$dst,$tmp\n\t"
4783             "pshufd  $tmp,$tmp2,0x02\n\t"
4784             "vaddss  $dst,$dst,$tmp\n\t"
4785             "pshufd  $tmp,$tmp2,0x03\n\t"
4786             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
4787   ins_encode %{
4788     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4789     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4790     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4791     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4792     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4793     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4794     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4795     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4796     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4797     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4798     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4799     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4800     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4801     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4802     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4803     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4804     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4805     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4806     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4807     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4808     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4809     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4810     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4811     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4812     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4813     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4814     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4815     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4816     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4817     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4818     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4819   %}
4820   ins_pipe( pipe_slow );
4821 %}
4822 
4823 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4824   predicate(UseSSE >= 1 && UseAVX == 0);
4825   match(Set dst (AddReductionVD dst src2));
4826   effect(TEMP tmp, TEMP dst);
4827   format %{ "addsd   $dst,$src2\n\t"
4828             "pshufd  $tmp,$src2,0xE\n\t"
4829             "addsd   $dst,$tmp\t! add reduction2D" %}
4830   ins_encode %{
4831     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
4832     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4833     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
4834   %}
4835   ins_pipe( pipe_slow );
4836 %}
4837 
4838 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4839   predicate(UseAVX > 0);
4840   match(Set dst (AddReductionVD dst src2));
4841   effect(TEMP tmp, TEMP dst);
4842   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4843             "pshufd  $tmp,$src2,0xE\n\t"
4844             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
4845   ins_encode %{
4846     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4847     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4848     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4849   %}
4850   ins_pipe( pipe_slow );
4851 %}
4852 
4853 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
4854   predicate(UseAVX > 0);
4855   match(Set dst (AddReductionVD dst src2));
4856   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4857   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4858             "pshufd  $tmp,$src2,0xE\n\t"
4859             "vaddsd  $dst,$dst,$tmp\n\t"
4860             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4861             "vaddsd  $dst,$dst,$tmp2\n\t"
4862             "pshufd  $tmp,$tmp2,0xE\n\t"
4863             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
4864   ins_encode %{
4865     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4866     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4867     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4868     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4869     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4870     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4871     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4872   %}
4873   ins_pipe( pipe_slow );
4874 %}
4875 
4876 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
4877   predicate(UseAVX > 2);
4878   match(Set dst (AddReductionVD dst src2));
4879   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4880   format %{ "vaddsd  $dst,$dst,$src2\n\t"
4881             "pshufd  $tmp,$src2,0xE\n\t"
4882             "vaddsd  $dst,$dst,$tmp\n\t"
4883             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4884             "vaddsd  $dst,$dst,$tmp2\n\t"
4885             "pshufd  $tmp,$tmp2,0xE\n\t"
4886             "vaddsd  $dst,$dst,$tmp\n\t"
4887             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4888             "vaddsd  $dst,$dst,$tmp2\n\t"
4889             "pshufd  $tmp,$tmp2,0xE\n\t"
4890             "vaddsd  $dst,$dst,$tmp\n\t"
4891             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4892             "vaddsd  $dst,$dst,$tmp2\n\t"
4893             "pshufd  $tmp,$tmp2,0xE\n\t"
4894             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
4895   ins_encode %{
4896     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4897     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4898     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4899     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4900     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4901     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4902     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4903     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4904     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4905     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4906     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4907     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4908     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4909     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4910     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4911   %}
4912   ins_pipe( pipe_slow );
4913 %}
4914 
4915 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4916   predicate(UseSSE > 3 && UseAVX == 0);
4917   match(Set dst (MulReductionVI src1 src2));
4918   effect(TEMP tmp, TEMP tmp2);
4919   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4920             "pmulld  $tmp2,$src2\n\t"
4921             "movd    $tmp,$src1\n\t"
4922             "pmulld  $tmp2,$tmp\n\t"
4923             "movd    $dst,$tmp2\t! mul reduction2I" %}
4924   ins_encode %{
4925     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4926     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
4927     __ movdl($tmp$$XMMRegister, $src1$$Register);
4928     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4929     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4930   %}
4931   ins_pipe( pipe_slow );
4932 %}
4933 
4934 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4935   predicate(UseAVX > 0);
4936   match(Set dst (MulReductionVI src1 src2));
4937   effect(TEMP tmp, TEMP tmp2);
4938   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
4939             "vpmulld  $tmp,$src2,$tmp2\n\t"
4940             "movd     $tmp2,$src1\n\t"
4941             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
4942             "movd     $dst,$tmp2\t! mul reduction2I" %}
4943   ins_encode %{
4944     int vector_len = 0;
4945     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4946     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4947     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4948     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4949     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4950   %}
4951   ins_pipe( pipe_slow );
4952 %}
4953 
4954 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4955   predicate(UseSSE > 3 && UseAVX == 0);
4956   match(Set dst (MulReductionVI src1 src2));
4957   effect(TEMP tmp, TEMP tmp2);
4958   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4959             "pmulld  $tmp2,$src2\n\t"
4960             "pshufd  $tmp,$tmp2,0x1\n\t"
4961             "pmulld  $tmp2,$tmp\n\t"
4962             "movd    $tmp,$src1\n\t"
4963             "pmulld  $tmp2,$tmp\n\t"
4964             "movd    $dst,$tmp2\t! mul reduction4I" %}
4965   ins_encode %{
4966     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4967     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
4968     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
4969     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4970     __ movdl($tmp$$XMMRegister, $src1$$Register);
4971     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
4972     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4973   %}
4974   ins_pipe( pipe_slow );
4975 %}
4976 
4977 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4978   predicate(UseAVX > 0);
4979   match(Set dst (MulReductionVI src1 src2));
4980   effect(TEMP tmp, TEMP tmp2);
4981   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
4982             "vpmulld  $tmp,$src2,$tmp2\n\t"
4983             "pshufd   $tmp2,$tmp,0x1\n\t"
4984             "vpmulld  $tmp,$tmp,$tmp2\n\t"
4985             "movd     $tmp2,$src1\n\t"
4986             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
4987             "movd     $dst,$tmp2\t! mul reduction4I" %}
4988   ins_encode %{
4989     int vector_len = 0;
4990     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4991     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4992     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4993     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4994     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4995     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4996     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4997   %}
4998   ins_pipe( pipe_slow );
4999 %}
5000 
5001 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5002   predicate(UseAVX > 0);
5003   match(Set dst (MulReductionVI src1 src2));
5004   effect(TEMP tmp, TEMP tmp2);
5005   format %{ "vextracti128_high  $tmp,$src2\n\t"
5006             "vpmulld  $tmp,$tmp,$src2\n\t"
5007             "pshufd   $tmp2,$tmp,0xE\n\t"
5008             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5009             "pshufd   $tmp2,$tmp,0x1\n\t"
5010             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5011             "movd     $tmp2,$src1\n\t"
5012             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5013             "movd     $dst,$tmp2\t! mul reduction8I" %}
5014   ins_encode %{
5015     int vector_len = 0;
5016     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5017     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5018     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5019     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5020     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5021     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5022     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5023     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5024     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5025   %}
5026   ins_pipe( pipe_slow );
5027 %}
5028 
5029 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5030   predicate(UseAVX > 2);
5031   match(Set dst (MulReductionVI src1 src2));
5032   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5033   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5034             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5035             "vextracti128_high  $tmp,$tmp3\n\t"
5036             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5037             "pshufd   $tmp2,$tmp,0xE\n\t"
5038             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5039             "pshufd   $tmp2,$tmp,0x1\n\t"
5040             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5041             "movd     $tmp2,$src1\n\t"
5042             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5043             "movd     $dst,$tmp2\t! mul reduction16I" %}
5044   ins_encode %{
5045     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5046     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5047     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5048     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5049     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5050     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5051     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5052     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5053     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5054     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5055     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5056   %}
5057   ins_pipe( pipe_slow );
5058 %}
5059 
5060 #ifdef _LP64
5061 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5062   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5063   match(Set dst (MulReductionVL src1 src2));
5064   effect(TEMP tmp, TEMP tmp2);
5065   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5066             "vpmullq  $tmp,$src2,$tmp2\n\t"
5067             "movdq    $tmp2,$src1\n\t"
5068             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5069             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5070   ins_encode %{
5071     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5072     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5073     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5074     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5075     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5076   %}
5077   ins_pipe( pipe_slow );
5078 %}
5079 
5080 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5081   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5082   match(Set dst (MulReductionVL src1 src2));
5083   effect(TEMP tmp, TEMP tmp2);
5084   format %{ "vextracti128_high  $tmp,$src2\n\t"
5085             "vpmullq  $tmp2,$tmp,$src2\n\t"
5086             "pshufd   $tmp,$tmp2,0xE\n\t"
5087             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5088             "movdq    $tmp,$src1\n\t"
5089             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5090             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5091   ins_encode %{
5092     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5093     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5094     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5095     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5096     __ movdq($tmp$$XMMRegister, $src1$$Register);
5097     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5098     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5099   %}
5100   ins_pipe( pipe_slow );
5101 %}
5102 
5103 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5104   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5105   match(Set dst (MulReductionVL src1 src2));
5106   effect(TEMP tmp, TEMP tmp2);
5107   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5108             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5109             "vextracti128_high  $tmp,$tmp2\n\t"
5110             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5111             "pshufd   $tmp,$tmp2,0xE\n\t"
5112             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5113             "movdq    $tmp,$src1\n\t"
5114             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5115             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5116   ins_encode %{
5117     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5118     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5119     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5120     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5121     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5122     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5123     __ movdq($tmp$$XMMRegister, $src1$$Register);
5124     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5125     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5126   %}
5127   ins_pipe( pipe_slow );
5128 %}
5129 #endif
5130 
5131 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5132   predicate(UseSSE >= 1 && UseAVX == 0);
5133   match(Set dst (MulReductionVF dst src2));
5134   effect(TEMP dst, TEMP tmp);
5135   format %{ "mulss   $dst,$src2\n\t"
5136             "pshufd  $tmp,$src2,0x01\n\t"
5137             "mulss   $dst,$tmp\t! mul reduction2F" %}
5138   ins_encode %{
5139     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5140     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5141     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5142   %}
5143   ins_pipe( pipe_slow );
5144 %}
5145 
5146 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5147   predicate(UseAVX > 0);
5148   match(Set dst (MulReductionVF dst src2));
5149   effect(TEMP tmp, TEMP dst);
5150   format %{ "vmulss  $dst,$dst,$src2\n\t"
5151             "pshufd  $tmp,$src2,0x01\n\t"
5152             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5153   ins_encode %{
5154     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5155     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5156     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5157   %}
5158   ins_pipe( pipe_slow );
5159 %}
5160 
5161 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5162   predicate(UseSSE >= 1 && UseAVX == 0);
5163   match(Set dst (MulReductionVF dst src2));
5164   effect(TEMP dst, TEMP tmp);
5165   format %{ "mulss   $dst,$src2\n\t"
5166             "pshufd  $tmp,$src2,0x01\n\t"
5167             "mulss   $dst,$tmp\n\t"
5168             "pshufd  $tmp,$src2,0x02\n\t"
5169             "mulss   $dst,$tmp\n\t"
5170             "pshufd  $tmp,$src2,0x03\n\t"
5171             "mulss   $dst,$tmp\t! mul reduction4F" %}
5172   ins_encode %{
5173     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5174     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5175     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5176     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5177     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5178     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5179     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5180   %}
5181   ins_pipe( pipe_slow );
5182 %}
5183 
5184 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5185   predicate(UseAVX > 0);
5186   match(Set dst (MulReductionVF dst src2));
5187   effect(TEMP tmp, TEMP dst);
5188   format %{ "vmulss  $dst,$dst,$src2\n\t"
5189             "pshufd  $tmp,$src2,0x01\n\t"
5190             "vmulss  $dst,$dst,$tmp\n\t"
5191             "pshufd  $tmp,$src2,0x02\n\t"
5192             "vmulss  $dst,$dst,$tmp\n\t"
5193             "pshufd  $tmp,$src2,0x03\n\t"
5194             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5195   ins_encode %{
5196     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5197     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5198     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5199     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5200     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5201     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5202     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5203   %}
5204   ins_pipe( pipe_slow );
5205 %}
5206 
5207 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5208   predicate(UseAVX > 0);
5209   match(Set dst (MulReductionVF dst src2));
5210   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5211   format %{ "vmulss  $dst,$dst,$src2\n\t"
5212             "pshufd  $tmp,$src2,0x01\n\t"
5213             "vmulss  $dst,$dst,$tmp\n\t"
5214             "pshufd  $tmp,$src2,0x02\n\t"
5215             "vmulss  $dst,$dst,$tmp\n\t"
5216             "pshufd  $tmp,$src2,0x03\n\t"
5217             "vmulss  $dst,$dst,$tmp\n\t"
5218             "vextractf128_high  $tmp2,$src2\n\t"
5219             "vmulss  $dst,$dst,$tmp2\n\t"
5220             "pshufd  $tmp,$tmp2,0x01\n\t"
5221             "vmulss  $dst,$dst,$tmp\n\t"
5222             "pshufd  $tmp,$tmp2,0x02\n\t"
5223             "vmulss  $dst,$dst,$tmp\n\t"
5224             "pshufd  $tmp,$tmp2,0x03\n\t"
5225             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5226   ins_encode %{
5227     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5228     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5229     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5230     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5231     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5232     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5233     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5234     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5235     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5236     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5237     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5238     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5239     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5240     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5241     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5242   %}
5243   ins_pipe( pipe_slow );
5244 %}
5245 
5246 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5247   predicate(UseAVX > 2);
5248   match(Set dst (MulReductionVF dst src2));
5249   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5250   format %{ "vmulss  $dst,$dst,$src2\n\t"
5251             "pshufd  $tmp,$src2,0x01\n\t"
5252             "vmulss  $dst,$dst,$tmp\n\t"
5253             "pshufd  $tmp,$src2,0x02\n\t"
5254             "vmulss  $dst,$dst,$tmp\n\t"
5255             "pshufd  $tmp,$src2,0x03\n\t"
5256             "vmulss  $dst,$dst,$tmp\n\t"
5257             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5258             "vmulss  $dst,$dst,$tmp2\n\t"
5259             "pshufd  $tmp,$tmp2,0x01\n\t"
5260             "vmulss  $dst,$dst,$tmp\n\t"
5261             "pshufd  $tmp,$tmp2,0x02\n\t"
5262             "vmulss  $dst,$dst,$tmp\n\t"
5263             "pshufd  $tmp,$tmp2,0x03\n\t"
5264             "vmulss  $dst,$dst,$tmp\n\t"
5265             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5266             "vmulss  $dst,$dst,$tmp2\n\t"
5267             "pshufd  $tmp,$tmp2,0x01\n\t"
5268             "vmulss  $dst,$dst,$tmp\n\t"
5269             "pshufd  $tmp,$tmp2,0x02\n\t"
5270             "vmulss  $dst,$dst,$tmp\n\t"
5271             "pshufd  $tmp,$tmp2,0x03\n\t"
5272             "vmulss  $dst,$dst,$tmp\n\t"
5273             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5274             "vmulss  $dst,$dst,$tmp2\n\t"
5275             "pshufd  $tmp,$tmp2,0x01\n\t"
5276             "vmulss  $dst,$dst,$tmp\n\t"
5277             "pshufd  $tmp,$tmp2,0x02\n\t"
5278             "vmulss  $dst,$dst,$tmp\n\t"
5279             "pshufd  $tmp,$tmp2,0x03\n\t"
5280             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5281   ins_encode %{
5282     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5283     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5284     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5285     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5286     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5287     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5288     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5289     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5290     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5291     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5292     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5293     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5294     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5295     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5296     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5297     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5298     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5299     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5300     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5301     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5302     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5303     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5304     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5305     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5306     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5307     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5308     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5309     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5310     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5311     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5312     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5313   %}
5314   ins_pipe( pipe_slow );
5315 %}
5316 
5317 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5318   predicate(UseSSE >= 1 && UseAVX == 0);
5319   match(Set dst (MulReductionVD dst src2));
5320   effect(TEMP dst, TEMP tmp);
5321   format %{ "mulsd   $dst,$src2\n\t"
5322             "pshufd  $tmp,$src2,0xE\n\t"
5323             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5324   ins_encode %{
5325     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5326     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5327     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5328   %}
5329   ins_pipe( pipe_slow );
5330 %}
5331 
5332 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5333   predicate(UseAVX > 0);
5334   match(Set dst (MulReductionVD dst src2));
5335   effect(TEMP tmp, TEMP dst);
5336   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5337             "pshufd  $tmp,$src2,0xE\n\t"
5338             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5339   ins_encode %{
5340     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5341     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5342     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5343   %}
5344   ins_pipe( pipe_slow );
5345 %}
5346 
5347 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5348   predicate(UseAVX > 0);
5349   match(Set dst (MulReductionVD dst src2));
5350   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5351   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5352             "pshufd  $tmp,$src2,0xE\n\t"
5353             "vmulsd  $dst,$dst,$tmp\n\t"
5354             "vextractf128_high  $tmp2,$src2\n\t"
5355             "vmulsd  $dst,$dst,$tmp2\n\t"
5356             "pshufd  $tmp,$tmp2,0xE\n\t"
5357             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5358   ins_encode %{
5359     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5360     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5361     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5362     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5363     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5364     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5365     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5366   %}
5367   ins_pipe( pipe_slow );
5368 %}
5369 
5370 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5371   predicate(UseAVX > 2);
5372   match(Set dst (MulReductionVD dst src2));
5373   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5374   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5375             "pshufd  $tmp,$src2,0xE\n\t"
5376             "vmulsd  $dst,$dst,$tmp\n\t"
5377             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5378             "vmulsd  $dst,$dst,$tmp2\n\t"
5379             "pshufd  $tmp,$tmp2,0xE\n\t"
5380             "vmulsd  $dst,$dst,$tmp\n\t"
5381             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5382             "vmulsd  $dst,$dst,$tmp2\n\t"
5383             "pshufd  $tmp,$tmp2,0xE\n\t"
5384             "vmulsd  $dst,$dst,$tmp\n\t"
5385             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5386             "vmulsd  $dst,$dst,$tmp2\n\t"
5387             "pshufd  $tmp,$tmp2,0xE\n\t"
5388             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5389   ins_encode %{
5390     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5391     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5392     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5393     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5394     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5395     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5396     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5397     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5398     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5399     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5400     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5401     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5402     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5403     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5404     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5405   %}
5406   ins_pipe( pipe_slow );
5407 %}
5408 
5409 // ====================VECTOR ARITHMETIC=======================================
5410 
5411 // --------------------------------- ADD --------------------------------------
5412 
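// In the encodings below, vector_len selects the operand size of the AVX/EVEX
// instruction: 0 = 128-bit (xmm), 1 = 256-bit (ymm), 2 = 512-bit (zmm),
// matching the vecS/vecD/vecX, vecY and vecZ operand classes.  Byte and short
// vector operations additionally depend on AVX512BW, so each of their sizes
// comes in three flavours: a two-address SSE form (UseAVX == 0), three-operand
// register and memory forms for AVX-only (supports_avxonly) and AVX512BW
// (supports_avx512bw) targets, and an "_evex_special" two-address form for
// AVX-512 targets without the BW extension (supports_avx512nobw).
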
5413 // Bytes vector add
5414 instruct vadd4B(vecS dst, vecS src) %{
5415   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5416   match(Set dst (AddVB dst src));
5417   format %{ "paddb   $dst,$src\t! add packed4B" %}
5418   ins_encode %{
5419     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5420   %}
5421   ins_pipe( pipe_slow );
5422 %}
5423 
5424 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5425   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5426   match(Set dst (AddVB src1 src2));
5427   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5428   ins_encode %{
5429     int vector_len = 0;
5430     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5431   %}
5432   ins_pipe( pipe_slow );
5433 %}
5434 
5435 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5436   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5437   match(Set dst (AddVB src1 src2));
5438   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5439   ins_encode %{
5440     int vector_len = 0;
5441     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5442   %}
5443   ins_pipe( pipe_slow );
5444 %}
5445 
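// The "_evex_special" rules handle AVX-512 targets that lack the BW extension
// (VM_Version::supports_avx512nobw()).  They match the two-address form
// (Set dst (AddVB dst src2)), so the encoding reuses $dst as the first source;
// the TEMP operand is only a scratch register reserved by the rule and is not
// read.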
5446 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5447   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5448   match(Set dst (AddVB dst src2));
5449   effect(TEMP src1);
5450   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5451   ins_encode %{
5452     int vector_len = 0;
5453     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5454   %}
5455   ins_pipe( pipe_slow );
5456 %}
5457 
5458 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5459   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5460   match(Set dst (AddVB src (LoadVector mem)));
5461   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5462   ins_encode %{
5463     int vector_len = 0;
5464     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5465   %}
5466   ins_pipe( pipe_slow );
5467 %}
5468 
5469 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5470   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5471   match(Set dst (AddVB src (LoadVector mem)));
5472   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5473   ins_encode %{
5474     int vector_len = 0;
5475     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5476   %}
5477   ins_pipe( pipe_slow );
5478 %}
5479 
5480 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5481   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5482   match(Set dst (AddVB dst (LoadVector mem)));
5483   effect(TEMP src);
5484   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5485   ins_encode %{
5486     int vector_len = 0;
5487     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5488   %}
5489   ins_pipe( pipe_slow );
5490 %}
5491 
5492 instruct vadd8B(vecD dst, vecD src) %{
5493   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5494   match(Set dst (AddVB dst src));
5495   format %{ "paddb   $dst,$src\t! add packed8B" %}
5496   ins_encode %{
5497     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5498   %}
5499   ins_pipe( pipe_slow );
5500 %}
5501 
5502 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5503   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5504   match(Set dst (AddVB src1 src2));
5505   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5506   ins_encode %{
5507     int vector_len = 0;
5508     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5509   %}
5510   ins_pipe( pipe_slow );
5511 %}
5512 
5513 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5514   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5515   match(Set dst (AddVB src1 src2));
5516   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5517   ins_encode %{
5518     int vector_len = 0;
5519     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5520   %}
5521   ins_pipe( pipe_slow );
5522 %}
5523 
5524 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5525   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5526   match(Set dst (AddVB dst src2));
5527   effect(TEMP src1);
5528   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5529   ins_encode %{
5530     int vector_len = 0;
5531     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5532   %}
5533   ins_pipe( pipe_slow );
5534 %}
5535 
5536 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5537   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5538   match(Set dst (AddVB src (LoadVector mem)));
5539   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5540   ins_encode %{
5541     int vector_len = 0;
5542     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5543   %}
5544   ins_pipe( pipe_slow );
5545 %}
5546 
5547 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5548   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5549   match(Set dst (AddVB src (LoadVector mem)));
5550   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5551   ins_encode %{
5552     int vector_len = 0;
5553     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5554   %}
5555   ins_pipe( pipe_slow );
5556 %}
5557 
5558 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5559   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5560   match(Set dst (AddVB dst (LoadVector mem)));
5561   effect(TEMP src);
5562   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5563   ins_encode %{
5564     int vector_len = 0;
5565     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5566   %}
5567   ins_pipe( pipe_slow );
5568 %}
5569 
5570 instruct vadd16B(vecX dst, vecX src) %{
5571   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5572   match(Set dst (AddVB dst src));
5573   format %{ "paddb   $dst,$src\t! add packed16B" %}
5574   ins_encode %{
5575     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5576   %}
5577   ins_pipe( pipe_slow );
5578 %}
5579 
5580 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5581   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5582   match(Set dst (AddVB src1 src2));
5583   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5584   ins_encode %{
5585     int vector_len = 0;
5586     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5587   %}
5588   ins_pipe( pipe_slow );
5589 %}
5590 
5591 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5592   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5593   match(Set dst (AddVB src1 src2));
5594   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5595   ins_encode %{
5596     int vector_len = 0;
5597     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5598   %}
5599   ins_pipe( pipe_slow );
5600 %}
5601 
5602 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5603   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5604   match(Set dst (AddVB dst src2));
5605   effect(TEMP src1);
5606   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5607   ins_encode %{
5608     int vector_len = 0;
5609     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5610   %}
5611   ins_pipe( pipe_slow );
5612 %}
5613 
5614 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5615   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5616   match(Set dst (AddVB src (LoadVector mem)));
5617   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5618   ins_encode %{
5619     int vector_len = 0;
5620     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5621   %}
5622   ins_pipe( pipe_slow );
5623 %}
5624 
5625 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5626   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5627   match(Set dst (AddVB src (LoadVector mem)));
5628   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5629   ins_encode %{
5630     int vector_len = 0;
5631     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5632   %}
5633   ins_pipe( pipe_slow );
5634 %}
5635 
5636 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5637   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5638   match(Set dst (AddVB dst (LoadVector mem)));
5639   effect(TEMP src);
5640   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5641   ins_encode %{
5642     int vector_len = 0;
5643     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5644   %}
5645   ins_pipe( pipe_slow );
5646 %}
5647 
5648 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5649   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5650   match(Set dst (AddVB src1 src2));
5651   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5652   ins_encode %{
5653     int vector_len = 1;
5654     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5655   %}
5656   ins_pipe( pipe_slow );
5657 %}
5658 
5659 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5660   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5661   match(Set dst (AddVB src1 src2));
5662   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5663   ins_encode %{
5664     int vector_len = 1;
5665     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5666   %}
5667   ins_pipe( pipe_slow );
5668 %}
5669 
5670 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5671   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5672   match(Set dst (AddVB dst src2));
5673   effect(TEMP src1);
5674   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5675   ins_encode %{
5676     int vector_len = 1;
5677     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5678   %}
5679   ins_pipe( pipe_slow );
5680 %}
5681 
5682 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5683   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5684   match(Set dst (AddVB src (LoadVector mem)));
5685   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5686   ins_encode %{
5687     int vector_len = 1;
5688     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5689   %}
5690   ins_pipe( pipe_slow );
5691 %}
5692 
5693 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5694   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5695   match(Set dst (AddVB src (LoadVector mem)));
5696   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5697   ins_encode %{
5698     int vector_len = 1;
5699     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5700   %}
5701   ins_pipe( pipe_slow );
5702 %}
5703 
5704 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
5705   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5706   match(Set dst (AddVB dst (LoadVector mem)));
5707   effect(TEMP src);
5708   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5709   ins_encode %{
5710     int vector_len = 1;
5711     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5712   %}
5713   ins_pipe( pipe_slow );
5714 %}
5715 
5716 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5717   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5718   match(Set dst (AddVB src1 src2));
5719   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5720   ins_encode %{
5721     int vector_len = 2;
5722     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5723   %}
5724   ins_pipe( pipe_slow );
5725 %}
5726 
5727 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5728   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5729   match(Set dst (AddVB src (LoadVector mem)));
5730   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
5731   ins_encode %{
5732     int vector_len = 2;
5733     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5734   %}
5735   ins_pipe( pipe_slow );
5736 %}
5737 
5738 // Shorts/Chars vector add
5739 instruct vadd2S(vecS dst, vecS src) %{
5740   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
5741   match(Set dst (AddVS dst src));
5742   format %{ "paddw   $dst,$src\t! add packed2S" %}
5743   ins_encode %{
5744     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5745   %}
5746   ins_pipe( pipe_slow );
5747 %}
5748 
5749 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
5750   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5751   match(Set dst (AddVS src1 src2));
5752   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5753   ins_encode %{
5754     int vector_len = 0;
5755     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5756   %}
5757   ins_pipe( pipe_slow );
5758 %}
5759 
5760 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
5761   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5762   match(Set dst (AddVS src1 src2));
5763   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5764   ins_encode %{
5765     int vector_len = 0;
5766     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5767   %}
5768   ins_pipe( pipe_slow );
5769 %}
5770 
5771 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5772   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5773   match(Set dst (AddVS dst src2));
5774   effect(TEMP src1);
5775   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
5776   ins_encode %{
5777     int vector_len = 0;
5778     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5779   %}
5780   ins_pipe( pipe_slow );
5781 %}
5782 
5783 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
5784   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5785   match(Set dst (AddVS src (LoadVector mem)));
5786   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5787   ins_encode %{
5788     int vector_len = 0;
5789     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5790   %}
5791   ins_pipe( pipe_slow );
5792 %}
5793 
5794 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
5795   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5796   match(Set dst (AddVS src (LoadVector mem)));
5797   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5798   ins_encode %{
5799     int vector_len = 0;
5800     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5801   %}
5802   ins_pipe( pipe_slow );
5803 %}
5804 
5805 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
5806   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5807   match(Set dst (AddVS dst (LoadVector mem)));
5808   effect(TEMP src);
5809   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5810   ins_encode %{
5811     int vector_len = 0;
5812     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5813   %}
5814   ins_pipe( pipe_slow );
5815 %}
5816 
5817 instruct vadd4S(vecD dst, vecD src) %{
5818   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5819   match(Set dst (AddVS dst src));
5820   format %{ "paddw   $dst,$src\t! add packed4S" %}
5821   ins_encode %{
5822     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5823   %}
5824   ins_pipe( pipe_slow );
5825 %}
5826 
5827 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
5828   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5829   match(Set dst (AddVS src1 src2));
5830   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5831   ins_encode %{
5832     int vector_len = 0;
5833     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5834   %}
5835   ins_pipe( pipe_slow );
5836 %}
5837 
5838 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
5839   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5840   match(Set dst (AddVS src1 src2));
5841   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5842   ins_encode %{
5843     int vector_len = 0;
5844     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5845   %}
5846   ins_pipe( pipe_slow );
5847 %}
5848 
5849 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5850   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5851   match(Set dst (AddVS dst src2));
5852   effect(TEMP src1);
5853   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
5854   ins_encode %{
5855     int vector_len = 0;
5856     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5857   %}
5858   ins_pipe( pipe_slow );
5859 %}
5860 
5861 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
5862   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5863   match(Set dst (AddVS src (LoadVector mem)));
5864   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5865   ins_encode %{
5866     int vector_len = 0;
5867     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5868   %}
5869   ins_pipe( pipe_slow );
5870 %}
5871 
5872 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
5873   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5874   match(Set dst (AddVS src (LoadVector mem)));
5875   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5876   ins_encode %{
5877     int vector_len = 0;
5878     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5879   %}
5880   ins_pipe( pipe_slow );
5881 %}
5882 
5883 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
5884   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5885   match(Set dst (AddVS dst (LoadVector mem)));
5886   effect(TEMP src);
5887   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5888   ins_encode %{
5889     int vector_len = 0;
5890     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5891   %}
5892   ins_pipe( pipe_slow );
5893 %}
5894 
5895 instruct vadd8S(vecX dst, vecX src) %{
5896   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5897   match(Set dst (AddVS dst src));
5898   format %{ "paddw   $dst,$src\t! add packed8S" %}
5899   ins_encode %{
5900     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5901   %}
5902   ins_pipe( pipe_slow );
5903 %}
5904 
5905 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
5906   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5907   match(Set dst (AddVS src1 src2));
5908   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5909   ins_encode %{
5910     int vector_len = 0;
5911     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5912   %}
5913   ins_pipe( pipe_slow );
5914 %}
5915 
5916 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
5917   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5918   match(Set dst (AddVS src1 src2));
5919   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5920   ins_encode %{
5921     int vector_len = 0;
5922     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5923   %}
5924   ins_pipe( pipe_slow );
5925 %}
5926 
5927 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5928   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5929   match(Set dst (AddVS dst src2));
5930   effect(TEMP src1);
5931   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
5932   ins_encode %{
5933     int vector_len = 0;
5934     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5935   %}
5936   ins_pipe( pipe_slow );
5937 %}
5938 
5939 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
5940   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5941   match(Set dst (AddVS src (LoadVector mem)));
5942   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5943   ins_encode %{
5944     int vector_len = 0;
5945     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5946   %}
5947   ins_pipe( pipe_slow );
5948 %}
5949 
5950 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
5951   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5952   match(Set dst (AddVS src (LoadVector mem)));
5953   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5954   ins_encode %{
5955     int vector_len = 0;
5956     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5957   %}
5958   ins_pipe( pipe_slow );
5959 %}
5960 
5961 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
5962   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5963   match(Set dst (AddVS dst (LoadVector mem)));
5964   effect(TEMP src);
5965   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
5966   ins_encode %{
5967     int vector_len = 0;
5968     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5969   %}
5970   ins_pipe( pipe_slow );
5971 %}
5972 
5973 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
5974   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
5975   match(Set dst (AddVS src1 src2));
5976   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
5977   ins_encode %{
5978     int vector_len = 1;
5979     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5980   %}
5981   ins_pipe( pipe_slow );
5982 %}
5983 
5984 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
5985   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5986   match(Set dst (AddVS src1 src2));
5987   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
5988   ins_encode %{
5989     int vector_len = 1;
5990     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5991   %}
5992   ins_pipe( pipe_slow );
5993 %}
5994 
5995 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5996   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5997   match(Set dst (AddVS dst src2));
5998   effect(TEMP src1);
5999   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6000   ins_encode %{
6001     int vector_len = 1;
6002     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6003   %}
6004   ins_pipe( pipe_slow );
6005 %}
6006 
6007 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6008   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6009   match(Set dst (AddVS src (LoadVector mem)));
6010   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6011   ins_encode %{
6012     int vector_len = 1;
6013     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6014   %}
6015   ins_pipe( pipe_slow );
6016 %}
6017 
6018 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6019   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6020   match(Set dst (AddVS src (LoadVector mem)));
6021   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6022   ins_encode %{
6023     int vector_len = 1;
6024     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6025   %}
6026   ins_pipe( pipe_slow );
6027 %}
6028 
6029 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6030   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6031   match(Set dst (AddVS dst (LoadVector mem)));
6032   effect(TEMP src);
6033   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6034   ins_encode %{
6035     int vector_len = 1;
6036     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6037   %}
6038   ins_pipe( pipe_slow );
6039 %}
6040 
6041 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6042   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6043   match(Set dst (AddVS src1 src2));
6044   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6045   ins_encode %{
6046     int vector_len = 2;
6047     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6048   %}
6049   ins_pipe( pipe_slow );
6050 %}
6051 
6052 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6053   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6054   match(Set dst (AddVS src (LoadVector mem)));
6055   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6056   ins_encode %{
6057     int vector_len = 2;
6058     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6059   %}
6060   ins_pipe( pipe_slow );
6061 %}
6062 
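// Unlike the byte/short rules above, int, long, float and double adds do not
// depend on AVX512BW, so the rules below use plain UseAVX level checks
// (e.g. UseAVX > 1 for 256-bit integer adds, UseAVX > 2 for the 512-bit
// forms) and need no "_evex_special" variants.
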
6063 // Integers vector add
6064 instruct vadd2I(vecD dst, vecD src) %{
6065   predicate(n->as_Vector()->length() == 2);
6066   match(Set dst (AddVI dst src));
6067   format %{ "paddd   $dst,$src\t! add packed2I" %}
6068   ins_encode %{
6069     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6070   %}
6071   ins_pipe( pipe_slow );
6072 %}
6073 
6074 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6075   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6076   match(Set dst (AddVI src1 src2));
6077   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6078   ins_encode %{
6079     int vector_len = 0;
6080     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6081   %}
6082   ins_pipe( pipe_slow );
6083 %}
6084 
6085 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6086   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6087   match(Set dst (AddVI src (LoadVector mem)));
6088   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6089   ins_encode %{
6090     int vector_len = 0;
6091     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6092   %}
6093   ins_pipe( pipe_slow );
6094 %}
6095 
6096 instruct vadd4I(vecX dst, vecX src) %{
6097   predicate(n->as_Vector()->length() == 4);
6098   match(Set dst (AddVI dst src));
6099   format %{ "paddd   $dst,$src\t! add packed4I" %}
6100   ins_encode %{
6101     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6102   %}
6103   ins_pipe( pipe_slow );
6104 %}
6105 
6106 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6107   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6108   match(Set dst (AddVI src1 src2));
6109   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6110   ins_encode %{
6111     int vector_len = 0;
6112     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6113   %}
6114   ins_pipe( pipe_slow );
6115 %}
6116 
6117 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6118   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6119   match(Set dst (AddVI src (LoadVector mem)));
6120   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6121   ins_encode %{
6122     int vector_len = 0;
6123     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6124   %}
6125   ins_pipe( pipe_slow );
6126 %}
6127 
6128 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6129   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6130   match(Set dst (AddVI src1 src2));
6131   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6132   ins_encode %{
6133     int vector_len = 1;
6134     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6135   %}
6136   ins_pipe( pipe_slow );
6137 %}
6138 
6139 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6140   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6141   match(Set dst (AddVI src (LoadVector mem)));
6142   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6143   ins_encode %{
6144     int vector_len = 1;
6145     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6146   %}
6147   ins_pipe( pipe_slow );
6148 %}
6149 
6150 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6151   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6152   match(Set dst (AddVI src1 src2));
6153   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6154   ins_encode %{
6155     int vector_len = 2;
6156     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6157   %}
6158   ins_pipe( pipe_slow );
6159 %}
6160 
6161 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6162   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6163   match(Set dst (AddVI src (LoadVector mem)));
6164   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6165   ins_encode %{
6166     int vector_len = 2;
6167     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6168   %}
6169   ins_pipe( pipe_slow );
6170 %}
6171 
6172 // Longs vector add
6173 instruct vadd2L(vecX dst, vecX src) %{
6174   predicate(n->as_Vector()->length() == 2);
6175   match(Set dst (AddVL dst src));
6176   format %{ "paddq   $dst,$src\t! add packed2L" %}
6177   ins_encode %{
6178     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6179   %}
6180   ins_pipe( pipe_slow );
6181 %}
6182 
6183 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6184   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6185   match(Set dst (AddVL src1 src2));
6186   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6187   ins_encode %{
6188     int vector_len = 0;
6189     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6190   %}
6191   ins_pipe( pipe_slow );
6192 %}
6193 
6194 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6195   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6196   match(Set dst (AddVL src (LoadVector mem)));
6197   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6198   ins_encode %{
6199     int vector_len = 0;
6200     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6201   %}
6202   ins_pipe( pipe_slow );
6203 %}
6204 
6205 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6206   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6207   match(Set dst (AddVL src1 src2));
6208   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6209   ins_encode %{
6210     int vector_len = 1;
6211     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6212   %}
6213   ins_pipe( pipe_slow );
6214 %}
6215 
6216 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6217   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6218   match(Set dst (AddVL src (LoadVector mem)));
6219   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6220   ins_encode %{
6221     int vector_len = 1;
6222     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6223   %}
6224   ins_pipe( pipe_slow );
6225 %}
6226 
6227 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6228   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6229   match(Set dst (AddVL src1 src2));
6230   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6231   ins_encode %{
6232     int vector_len = 2;
6233     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6234   %}
6235   ins_pipe( pipe_slow );
6236 %}
6237 
6238 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6239   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6240   match(Set dst (AddVL src (LoadVector mem)));
6241   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6242   ins_encode %{
6243     int vector_len = 2;
6244     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6245   %}
6246   ins_pipe( pipe_slow );
6247 %}
6248 
6249 // Floats vector add
6250 instruct vadd2F(vecD dst, vecD src) %{
6251   predicate(n->as_Vector()->length() == 2);
6252   match(Set dst (AddVF dst src));
6253   format %{ "addps   $dst,$src\t! add packed2F" %}
6254   ins_encode %{
6255     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6256   %}
6257   ins_pipe( pipe_slow );
6258 %}
6259 
6260 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6261   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6262   match(Set dst (AddVF src1 src2));
6263   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6264   ins_encode %{
6265     int vector_len = 0;
6266     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6267   %}
6268   ins_pipe( pipe_slow );
6269 %}
6270 
6271 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6272   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6273   match(Set dst (AddVF src (LoadVector mem)));
6274   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6275   ins_encode %{
6276     int vector_len = 0;
6277     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6278   %}
6279   ins_pipe( pipe_slow );
6280 %}
6281 
6282 instruct vadd4F(vecX dst, vecX src) %{
6283   predicate(n->as_Vector()->length() == 4);
6284   match(Set dst (AddVF dst src));
6285   format %{ "addps   $dst,$src\t! add packed4F" %}
6286   ins_encode %{
6287     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6288   %}
6289   ins_pipe( pipe_slow );
6290 %}
6291 
6292 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6293   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6294   match(Set dst (AddVF src1 src2));
6295   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6296   ins_encode %{
6297     int vector_len = 0;
6298     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6299   %}
6300   ins_pipe( pipe_slow );
6301 %}
6302 
6303 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6304   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6305   match(Set dst (AddVF src (LoadVector mem)));
6306   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6307   ins_encode %{
6308     int vector_len = 0;
6309     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6310   %}
6311   ins_pipe( pipe_slow );
6312 %}
6313 
6314 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6315   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6316   match(Set dst (AddVF src1 src2));
6317   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6318   ins_encode %{
6319     int vector_len = 1;
6320     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6321   %}
6322   ins_pipe( pipe_slow );
6323 %}
6324 
6325 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6326   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6327   match(Set dst (AddVF src (LoadVector mem)));
6328   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6329   ins_encode %{
6330     int vector_len = 1;
6331     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6332   %}
6333   ins_pipe( pipe_slow );
6334 %}
6335 
6336 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6337   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6338   match(Set dst (AddVF src1 src2));
6339   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6340   ins_encode %{
6341     int vector_len = 2;
6342     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6343   %}
6344   ins_pipe( pipe_slow );
6345 %}
6346 
6347 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6348   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6349   match(Set dst (AddVF src (LoadVector mem)));
6350   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6351   ins_encode %{
6352     int vector_len = 2;
6353     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6354   %}
6355   ins_pipe( pipe_slow );
6356 %}
6357 
6358 // Doubles vector add
6359 instruct vadd2D(vecX dst, vecX src) %{
6360   predicate(n->as_Vector()->length() == 2);
6361   match(Set dst (AddVD dst src));
6362   format %{ "addpd   $dst,$src\t! add packed2D" %}
6363   ins_encode %{
6364     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6365   %}
6366   ins_pipe( pipe_slow );
6367 %}
6368 
6369 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6370   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6371   match(Set dst (AddVD src1 src2));
6372   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6373   ins_encode %{
6374     int vector_len = 0;
6375     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6376   %}
6377   ins_pipe( pipe_slow );
6378 %}
6379 
6380 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6381   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6382   match(Set dst (AddVD src (LoadVector mem)));
6383   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6384   ins_encode %{
6385     int vector_len = 0;
6386     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6387   %}
6388   ins_pipe( pipe_slow );
6389 %}
6390 
6391 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6392   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6393   match(Set dst (AddVD src1 src2));
6394   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6395   ins_encode %{
6396     int vector_len = 1;
6397     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6398   %}
6399   ins_pipe( pipe_slow );
6400 %}
6401 
6402 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6403   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6404   match(Set dst (AddVD src (LoadVector mem)));
6405   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6406   ins_encode %{
6407     int vector_len = 1;
6408     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6409   %}
6410   ins_pipe( pipe_slow );
6411 %}
6412 
6413 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6414   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6415   match(Set dst (AddVD src1 src2));
6416   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6417   ins_encode %{
6418     int vector_len = 2;
6419     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6420   %}
6421   ins_pipe( pipe_slow );
6422 %}
6423 
6424 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6425   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6426   match(Set dst (AddVD src (LoadVector mem)));
6427   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6428   ins_encode %{
6429     int vector_len = 2;
6430     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6431   %}
6432   ins_pipe( pipe_slow );
6433 %}
6434 
6435 // --------------------------------- SUB --------------------------------------
6436 
6437 // Bytes vector sub
6438 instruct vsub4B(vecS dst, vecS src) %{
6439   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6440   match(Set dst (SubVB dst src));
6441   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6442   ins_encode %{
6443     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6444   %}
6445   ins_pipe( pipe_slow );
6446 %}
6447 
6448 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6449   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6450   match(Set dst (SubVB src1 src2));
6451   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6452   ins_encode %{
6453     int vector_len = 0;
6454     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6455   %}
6456   ins_pipe( pipe_slow );
6457 %}
6458 
6459 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6460   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6461   match(Set dst (SubVB src1 src2));
6462   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6463   ins_encode %{
6464     int vector_len = 0;
6465     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6466   %}
6467   ins_pipe( pipe_slow );
6468 %}
6469 
6470 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6471   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6472   match(Set dst (SubVB dst src2));
6473   effect(TEMP src1);
6474   format %{ "vpsubb  $dst,$dst,$src2\t! sub packed4B" %}
6475   ins_encode %{
6476     int vector_len = 0;
6477     __ vpsubb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6478   %}
6479   ins_pipe( pipe_slow );
6480 %}
6481 
6482 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6483   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6484   match(Set dst (SubVB src (LoadVector mem)));
6485   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6486   ins_encode %{
6487     int vector_len = 0;
6488     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6489   %}
6490   ins_pipe( pipe_slow );
6491 %}
6492 
6493 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6494   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6495   match(Set dst (SubVB src (LoadVector mem)));
6496   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6497   ins_encode %{
6498     int vector_len = 0;
6499     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6500   %}
6501   ins_pipe( pipe_slow );
6502 %}
6503 
6504 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6505   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6506   match(Set dst (SubVB dst (LoadVector mem)));
6507   effect(TEMP src);
6508   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6509   ins_encode %{
6510     int vector_len = 0;
6511     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6512   %}
6513   ins_pipe( pipe_slow );
6514 %}
6515 
6516 instruct vsub8B(vecD dst, vecD src) %{
6517   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6518   match(Set dst (SubVB dst src));
6519   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6520   ins_encode %{
6521     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6522   %}
6523   ins_pipe( pipe_slow );
6524 %}
6525 
6526 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6527   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6528   match(Set dst (SubVB src1 src2));
6529   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6530   ins_encode %{
6531     int vector_len = 0;
6532     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6533   %}
6534   ins_pipe( pipe_slow );
6535 %}
6536 
6537 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6538   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6539   match(Set dst (SubVB src1 src2));
6540   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6541   ins_encode %{
6542     int vector_len = 0;
6543     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6544   %}
6545   ins_pipe( pipe_slow );
6546 %}
6547 
6548 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6549   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6550   match(Set dst (SubVB dst src2));
6551   effect(TEMP src1);
6552   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6553   ins_encode %{
6554     int vector_len = 0;
6555     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6556   %}
6557   ins_pipe( pipe_slow );
6558 %}
6559 
6560 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6561   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6562   match(Set dst (SubVB src (LoadVector mem)));
6563   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6564   ins_encode %{
6565     int vector_len = 0;
6566     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6567   %}
6568   ins_pipe( pipe_slow );
6569 %}
6570 
6571 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6572   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6573   match(Set dst (SubVB src (LoadVector mem)));
6574   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6575   ins_encode %{
6576     int vector_len = 0;
6577     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6578   %}
6579   ins_pipe( pipe_slow );
6580 %}
6581 
6582 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6583   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6584   match(Set dst (SubVB dst (LoadVector mem)));
6585   effect(TEMP src);
6586   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6587   ins_encode %{
6588     int vector_len = 0;
6589     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6590   %}
6591   ins_pipe( pipe_slow );
6592 %}
6593 
6594 instruct vsub16B(vecX dst, vecX src) %{
6595   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6596   match(Set dst (SubVB dst src));
6597   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6598   ins_encode %{
6599     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6600   %}
6601   ins_pipe( pipe_slow );
6602 %}
6603 
6604 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6605   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6606   match(Set dst (SubVB src1 src2));
6607   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6608   ins_encode %{
6609     int vector_len = 0;
6610     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6611   %}
6612   ins_pipe( pipe_slow );
6613 %}
6614 
6615 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6616   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6617   match(Set dst (SubVB src1 src2));
6618   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6619   ins_encode %{
6620     int vector_len = 0;
6621     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6622   %}
6623   ins_pipe( pipe_slow );
6624 %}
6625 
6626 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6627   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6628   match(Set dst (SubVB dst src2));
6629   effect(TEMP src1);
6630   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6631   ins_encode %{
6632     int vector_len = 0;
6633     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6634   %}
6635   ins_pipe( pipe_slow );
6636 %}
6637 
6638 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6639   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6640   match(Set dst (SubVB src (LoadVector mem)));
6641   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6642   ins_encode %{
6643     int vector_len = 0;
6644     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6645   %}
6646   ins_pipe( pipe_slow );
6647 %}
6648 
6649 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6650   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6651   match(Set dst (SubVB src (LoadVector mem)));
6652   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6653   ins_encode %{
6654     int vector_len = 0;
6655     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6656   %}
6657   ins_pipe( pipe_slow );
6658 %}
6659 
6660 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6661   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6662   match(Set dst (SubVB dst (LoadVector mem)));
6663   effect(TEMP src);
6664   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6665   ins_encode %{
6666     int vector_len = 0;
6667     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6668   %}
6669   ins_pipe( pipe_slow );
6670 %}
6671 
6672 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6673   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6674   match(Set dst (SubVB src1 src2));
6675   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6676   ins_encode %{
6677     int vector_len = 1;
6678     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6679   %}
6680   ins_pipe( pipe_slow );
6681 %}
6682 
6683 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6684   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6685   match(Set dst (SubVB src1 src2));
6686   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6687   ins_encode %{
6688     int vector_len = 1;
6689     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6690   %}
6691   ins_pipe( pipe_slow );
6692 %}
6693 
6694 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6695   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6696   match(Set dst (SubVB dst src2));
6697   effect(TEMP src1);
6698   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6699   ins_encode %{
6700     int vector_len = 1;
6701     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6702   %}
6703   ins_pipe( pipe_slow );
6704 %}
6705 
6706 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
6707   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6708   match(Set dst (SubVB src (LoadVector mem)));
6709   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6710   ins_encode %{
6711     int vector_len = 1;
6712     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6713   %}
6714   ins_pipe( pipe_slow );
6715 %}
6716 
6717 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
6718   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6719   match(Set dst (SubVB src (LoadVector mem)));
6720   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6721   ins_encode %{
6722     int vector_len = 1;
6723     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6724   %}
6725   ins_pipe( pipe_slow );
6726 %}
6727 
6728 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6729   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6730   match(Set dst (SubVB dst (LoadVector mem)));
6731   effect(TEMP src);
6732   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6733   ins_encode %{
6734     int vector_len = 1;
6735     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6736   %}
6737   ins_pipe( pipe_slow );
6738 %}
6739 
6740 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6741   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6742   match(Set dst (SubVB src1 src2));
6743   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6744   ins_encode %{
6745     int vector_len = 2;
6746     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6747   %}
6748   ins_pipe( pipe_slow );
6749 %}
6750 
6751 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6752   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6753   match(Set dst (SubVB src (LoadVector mem)));
6754   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6755   ins_encode %{
6756     int vector_len = 2;
6757     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6758   %}
6759   ins_pipe( pipe_slow );
6760 %}
6761 
6762 // Shorts/Chars vector sub
6763 instruct vsub2S(vecS dst, vecS src) %{
6764   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6765   match(Set dst (SubVS dst src));
6766   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6767   ins_encode %{
6768     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6769   %}
6770   ins_pipe( pipe_slow );
6771 %}
6772 
6773 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6774   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6775   match(Set dst (SubVS src1 src2));
6776   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6777   ins_encode %{
6778     int vector_len = 0;
6779     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6780   %}
6781   ins_pipe( pipe_slow );
6782 %}
6783 
6784 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6785   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6786   match(Set dst (SubVS src1 src2));
6787   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6788   ins_encode %{
6789     int vector_len = 0;
6790     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6791   %}
6792   ins_pipe( pipe_slow );
6793 %}
6794 
6795 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6796   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6797   match(Set dst (SubVS dst src2));
6798   effect(TEMP src1);
6799   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6800   ins_encode %{
6801     int vector_len = 0;
6802     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6803   %}
6804   ins_pipe( pipe_slow );
6805 %}
6806 
6807 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
6808   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6809   match(Set dst (SubVS src (LoadVector mem)));
6810   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6811   ins_encode %{
6812     int vector_len = 0;
6813     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6814   %}
6815   ins_pipe( pipe_slow );
6816 %}
6817 
6818 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
6819   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6820   match(Set dst (SubVS src (LoadVector mem)));
6821   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6822   ins_encode %{
6823     int vector_len = 0;
6824     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6825   %}
6826   ins_pipe( pipe_slow );
6827 %}
6828 
6829 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6830   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6831   match(Set dst (SubVS dst (LoadVector mem)));
6832   effect(TEMP src);
6833   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6834   ins_encode %{
6835     int vector_len = 0;
6836     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6837   %}
6838   ins_pipe( pipe_slow );
6839 %}
6840 
6841 instruct vsub4S(vecD dst, vecD src) %{
6842   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6843   match(Set dst (SubVS dst src));
6844   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6845   ins_encode %{
6846     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6847   %}
6848   ins_pipe( pipe_slow );
6849 %}
6850 
6851 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
6852   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6853   match(Set dst (SubVS src1 src2));
6854   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6855   ins_encode %{
6856     int vector_len = 0;
6857     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6858   %}
6859   ins_pipe( pipe_slow );
6860 %}
6861 
6862 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6863   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6864   match(Set dst (SubVS src1 src2));
6865   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6866   ins_encode %{
6867     int vector_len = 0;
6868     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6869   %}
6870   ins_pipe( pipe_slow );
6871 %}
6872 
6873 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6874   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6875   match(Set dst (SubVS dst src2));
6876   effect(TEMP src1);
6877   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6878   ins_encode %{
6879     int vector_len = 0;
6880     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6881   %}
6882   ins_pipe( pipe_slow );
6883 %}
6884 
6885 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
6886   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6887   match(Set dst (SubVS src (LoadVector mem)));
6888   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6889   ins_encode %{
6890     int vector_len = 0;
6891     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6892   %}
6893   ins_pipe( pipe_slow );
6894 %}
6895 
6896 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
6897   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6898   match(Set dst (SubVS src (LoadVector mem)));
6899   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6900   ins_encode %{
6901     int vector_len = 0;
6902     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6903   %}
6904   ins_pipe( pipe_slow );
6905 %}
6906 
6907 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6908   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6909   match(Set dst (SubVS dst (LoadVector mem)));
6910   effect(TEMP src);
6911   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6912   ins_encode %{
6913     int vector_len = 0;
6914     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6915   %}
6916   ins_pipe( pipe_slow );
6917 %}
6918 
6919 instruct vsub8S(vecX dst, vecX src) %{
6920   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6921   match(Set dst (SubVS dst src));
6922   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6923   ins_encode %{
6924     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6925   %}
6926   ins_pipe( pipe_slow );
6927 %}
6928 
6929 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6930   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6931   match(Set dst (SubVS src1 src2));
6932   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6933   ins_encode %{
6934     int vector_len = 0;
6935     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6936   %}
6937   ins_pipe( pipe_slow );
6938 %}
6939 
6940 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6941   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6942   match(Set dst (SubVS src1 src2));
6943   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6944   ins_encode %{
6945     int vector_len = 0;
6946     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6947   %}
6948   ins_pipe( pipe_slow );
6949 %}
6950 
6951 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6952   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6953   match(Set dst (SubVS dst src2));
6954   effect(TEMP src1);
6955   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6956   ins_encode %{
6957     int vector_len = 0;
6958     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6959   %}
6960   ins_pipe( pipe_slow );
6961 %}
6962 
6963 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
6964   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6965   match(Set dst (SubVS src (LoadVector mem)));
6966   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6967   ins_encode %{
6968     int vector_len = 0;
6969     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6970   %}
6971   ins_pipe( pipe_slow );
6972 %}
6973 
6974 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
6975   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6976   match(Set dst (SubVS src (LoadVector mem)));
6977   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6978   ins_encode %{
6979     int vector_len = 0;
6980     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6981   %}
6982   ins_pipe( pipe_slow );
6983 %}
6984 
6985 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
6986   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6987   match(Set dst (SubVS dst (LoadVector mem)));
6988   effect(TEMP src);
6989   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6990   ins_encode %{
6991     int vector_len = 0;
6992     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6993   %}
6994   ins_pipe( pipe_slow );
6995 %}
6996 
6997 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6998   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6999   match(Set dst (SubVS src1 src2));
7000   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7001   ins_encode %{
7002     int vector_len = 1;
7003     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7004   %}
7005   ins_pipe( pipe_slow );
7006 %}
7007 
7008 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7009   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7010   match(Set dst (SubVS src1 src2));
7011   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7012   ins_encode %{
7013     int vector_len = 1;
7014     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7015   %}
7016   ins_pipe( pipe_slow );
7017 %}
7018 
7019 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7020   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7021   match(Set dst (SubVS dst src2));
7022   effect(TEMP src1);
7023   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7024   ins_encode %{
7025     int vector_len = 1;
7026     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7027   %}
7028   ins_pipe( pipe_slow );
7029 %}
7030 
7031 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7032   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7033   match(Set dst (SubVS src (LoadVector mem)));
7034   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7035   ins_encode %{
7036     int vector_len = 1;
7037     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7038   %}
7039   ins_pipe( pipe_slow );
7040 %}
7041 
7042 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7043   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7044   match(Set dst (SubVS src (LoadVector mem)));
7045   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7046   ins_encode %{
7047     int vector_len = 1;
7048     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7049   %}
7050   ins_pipe( pipe_slow );
7051 %}
7052 
7053 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7054   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7055   match(Set dst (SubVS dst (LoadVector mem)));
7056   effect(TEMP src);
7057   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7058   ins_encode %{
7059     int vector_len = 1;
7060     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7061   %}
7062   ins_pipe( pipe_slow );
7063 %}
7064 
7065 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7066   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7067   match(Set dst (SubVS src1 src2));
7068   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7069   ins_encode %{
7070     int vector_len = 2;
7071     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7072   %}
7073   ins_pipe( pipe_slow );
7074 %}
7075 
7076 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7077   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7078   match(Set dst (SubVS src (LoadVector mem)));
7079   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7080   ins_encode %{
7081     int vector_len = 2;
7082     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7083   %}
7084   ins_pipe( pipe_slow );
7085 %}
7086 
7087 // Integers vector sub
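// Packed doubleword/quadword subtracts (vpsubd/vpsubq) are part of base
// AVX-512F, so the integer and long rules below need no AVX512BW/"_special"
// split; they simply require a higher UseAVX level for each wider vector
// (UseAVX > 0 for 128-bit, > 1 for 256-bit, > 2 for 512-bit).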
7088 instruct vsub2I(vecD dst, vecD src) %{
7089   predicate(n->as_Vector()->length() == 2);
7090   match(Set dst (SubVI dst src));
7091   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7092   ins_encode %{
7093     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7094   %}
7095   ins_pipe( pipe_slow );
7096 %}
7097 
7098 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7099   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7100   match(Set dst (SubVI src1 src2));
7101   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7102   ins_encode %{
7103     int vector_len = 0;
7104     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7105   %}
7106   ins_pipe( pipe_slow );
7107 %}
7108 
7109 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7110   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7111   match(Set dst (SubVI src (LoadVector mem)));
7112   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7113   ins_encode %{
7114     int vector_len = 0;
7115     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7116   %}
7117   ins_pipe( pipe_slow );
7118 %}
7119 
7120 instruct vsub4I(vecX dst, vecX src) %{
7121   predicate(n->as_Vector()->length() == 4);
7122   match(Set dst (SubVI dst src));
7123   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7124   ins_encode %{
7125     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7126   %}
7127   ins_pipe( pipe_slow );
7128 %}
7129 
7130 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7131   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7132   match(Set dst (SubVI src1 src2));
7133   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7134   ins_encode %{
7135     int vector_len = 0;
7136     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7137   %}
7138   ins_pipe( pipe_slow );
7139 %}
7140 
7141 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7142   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7143   match(Set dst (SubVI src (LoadVector mem)));
7144   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7145   ins_encode %{
7146     int vector_len = 0;
7147     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7148   %}
7149   ins_pipe( pipe_slow );
7150 %}
7151 
7152 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7153   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7154   match(Set dst (SubVI src1 src2));
7155   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7156   ins_encode %{
7157     int vector_len = 1;
7158     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7159   %}
7160   ins_pipe( pipe_slow );
7161 %}
7162 
7163 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7164   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7165   match(Set dst (SubVI src (LoadVector mem)));
7166   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7167   ins_encode %{
7168     int vector_len = 1;
7169     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7170   %}
7171   ins_pipe( pipe_slow );
7172 %}
7173 
7174 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7175   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7176   match(Set dst (SubVI src1 src2));
7177   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7178   ins_encode %{
7179     int vector_len = 2;
7180     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7181   %}
7182   ins_pipe( pipe_slow );
7183 %}
7184 
7185 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7186   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7187   match(Set dst (SubVI src (LoadVector mem)));
7188   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7189   ins_encode %{
7190     int vector_len = 2;
7191     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7192   %}
7193   ins_pipe( pipe_slow );
7194 %}
7195 
7196 // Longs vector sub
7197 instruct vsub2L(vecX dst, vecX src) %{
7198   predicate(n->as_Vector()->length() == 2);
7199   match(Set dst (SubVL dst src));
7200   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7201   ins_encode %{
7202     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7203   %}
7204   ins_pipe( pipe_slow );
7205 %}
7206 
7207 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7208   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7209   match(Set dst (SubVL src1 src2));
7210   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7211   ins_encode %{
7212     int vector_len = 0;
7213     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7214   %}
7215   ins_pipe( pipe_slow );
7216 %}
7217 
7218 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7219   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7220   match(Set dst (SubVL src (LoadVector mem)));
7221   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7222   ins_encode %{
7223     int vector_len = 0;
7224     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7225   %}
7226   ins_pipe( pipe_slow );
7227 %}
7228 
7229 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7230   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7231   match(Set dst (SubVL src1 src2));
7232   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7233   ins_encode %{
7234     int vector_len = 1;
7235     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7236   %}
7237   ins_pipe( pipe_slow );
7238 %}
7239 
7240 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7241   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7242   match(Set dst (SubVL src (LoadVector mem)));
7243   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7244   ins_encode %{
7245     int vector_len = 1;
7246     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7247   %}
7248   ins_pipe( pipe_slow );
7249 %}
7250 
7251 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7252   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7253   match(Set dst (SubVL src1 src2));
7254   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7255   ins_encode %{
7256     int vector_len = 2;
7257     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7258   %}
7259   ins_pipe( pipe_slow );
7260 %}
7261 
7262 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7263   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7264   match(Set dst (SubVL src (LoadVector mem)));
7265   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7266   ins_encode %{
7267     int vector_len = 2;
7268     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7269   %}
7270   ins_pipe( pipe_slow );
7271 %}
7272 
7273 // Floats vector sub
7274 instruct vsub2F(vecD dst, vecD src) %{
7275   predicate(n->as_Vector()->length() == 2);
7276   match(Set dst (SubVF dst src));
7277   format %{ "subps   $dst,$src\t! sub packed2F" %}
7278   ins_encode %{
7279     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7280   %}
7281   ins_pipe( pipe_slow );
7282 %}
7283 
7284 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7285   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7286   match(Set dst (SubVF src1 src2));
7287   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7288   ins_encode %{
7289     int vector_len = 0;
7290     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7291   %}
7292   ins_pipe( pipe_slow );
7293 %}
7294 
7295 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7296   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7297   match(Set dst (SubVF src (LoadVector mem)));
7298   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7299   ins_encode %{
7300     int vector_len = 0;
7301     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7302   %}
7303   ins_pipe( pipe_slow );
7304 %}
7305 
7306 instruct vsub4F(vecX dst, vecX src) %{
7307   predicate(n->as_Vector()->length() == 4);
7308   match(Set dst (SubVF dst src));
7309   format %{ "subps   $dst,$src\t! sub packed4F" %}
7310   ins_encode %{
7311     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7312   %}
7313   ins_pipe( pipe_slow );
7314 %}
7315 
7316 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7317   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7318   match(Set dst (SubVF src1 src2));
7319   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7320   ins_encode %{
7321     int vector_len = 0;
7322     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7323   %}
7324   ins_pipe( pipe_slow );
7325 %}
7326 
7327 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7328   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7329   match(Set dst (SubVF src (LoadVector mem)));
7330   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7331   ins_encode %{
7332     int vector_len = 0;
7333     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7334   %}
7335   ins_pipe( pipe_slow );
7336 %}
7337 
7338 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7339   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7340   match(Set dst (SubVF src1 src2));
7341   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7342   ins_encode %{
7343     int vector_len = 1;
7344     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7345   %}
7346   ins_pipe( pipe_slow );
7347 %}
7348 
7349 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7350   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7351   match(Set dst (SubVF src (LoadVector mem)));
7352   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7353   ins_encode %{
7354     int vector_len = 1;
7355     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7356   %}
7357   ins_pipe( pipe_slow );
7358 %}
7359 
7360 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7361   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7362   match(Set dst (SubVF src1 src2));
7363   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7364   ins_encode %{
7365     int vector_len = 2;
7366     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7367   %}
7368   ins_pipe( pipe_slow );
7369 %}
7370 
7371 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7372   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7373   match(Set dst (SubVF src (LoadVector mem)));
7374   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7375   ins_encode %{
7376     int vector_len = 2;
7377     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7378   %}
7379   ins_pipe( pipe_slow );
7380 %}
7381 
7382 // Doubles vector sub
7383 instruct vsub2D(vecX dst, vecX src) %{
7384   predicate(n->as_Vector()->length() == 2);
7385   match(Set dst (SubVD dst src));
7386   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7387   ins_encode %{
7388     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7389   %}
7390   ins_pipe( pipe_slow );
7391 %}
7392 
7393 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7394   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7395   match(Set dst (SubVD src1 src2));
7396   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7397   ins_encode %{
7398     int vector_len = 0;
7399     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7400   %}
7401   ins_pipe( pipe_slow );
7402 %}
7403 
7404 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7405   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7406   match(Set dst (SubVD src (LoadVector mem)));
7407   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7408   ins_encode %{
7409     int vector_len = 0;
7410     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7411   %}
7412   ins_pipe( pipe_slow );
7413 %}
7414 
7415 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7416   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7417   match(Set dst (SubVD src1 src2));
7418   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7419   ins_encode %{
7420     int vector_len = 1;
7421     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7422   %}
7423   ins_pipe( pipe_slow );
7424 %}
7425 
7426 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7427   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7428   match(Set dst (SubVD src (LoadVector mem)));
7429   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7430   ins_encode %{
7431     int vector_len = 1;
7432     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7433   %}
7434   ins_pipe( pipe_slow );
7435 %}
7436 
7437 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7438   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7439   match(Set dst (SubVD src1 src2));
7440   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7441   ins_encode %{
7442     int vector_len = 2;
7443     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7444   %}
7445   ins_pipe( pipe_slow );
7446 %}
7447 
7448 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7449   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7450   match(Set dst (SubVD src (LoadVector mem)));
7451   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7452   ins_encode %{
7453     int vector_len = 2;
7454     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7455   %}
7456   ins_pipe( pipe_slow );
7457 %}
7458 
7459 // --------------------------------- MUL --------------------------------------
7460 
7461 // Shorts/Chars vector mul
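// pmullw/vpmullw multiply packed 16-bit elements and keep the low 16 bits of
// each product; the same avx/evex/"_special" variant scheme as the subtract
// rules above applies here.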
7462 instruct vmul2S(vecS dst, vecS src) %{
7463   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7464   match(Set dst (MulVS dst src));
7465   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7466   ins_encode %{
7467     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7468   %}
7469   ins_pipe( pipe_slow );
7470 %}
7471 
7472 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7473   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7474   match(Set dst (MulVS src1 src2));
7475   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7476   ins_encode %{
7477     int vector_len = 0;
7478     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7479   %}
7480   ins_pipe( pipe_slow );
7481 %}
7482 
7483 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7484   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7485   match(Set dst (MulVS src1 src2));
7486   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7487   ins_encode %{
7488     int vector_len = 0;
7489     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7490   %}
7491   ins_pipe( pipe_slow );
7492 %}
7493 
7494 instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
7495   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7496   match(Set dst (MulVS dst src2));
7497   effect(TEMP src1);
7498   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7499   ins_encode %{
7500     int vector_len = 0;
7501     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7502   %}
7503   ins_pipe( pipe_slow );
7504 %}
7505 
7506 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7507   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7508   match(Set dst (MulVS src (LoadVector mem)));
7509   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7510   ins_encode %{
7511     int vector_len = 0;
7512     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7513   %}
7514   ins_pipe( pipe_slow );
7515 %}
7516 
7517 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7518   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7519   match(Set dst (MulVS src (LoadVector mem)));
7520   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7521   ins_encode %{
7522     int vector_len = 0;
7523     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7524   %}
7525   ins_pipe( pipe_slow );
7526 %}
7527 
7528 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7529   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7530   match(Set dst (MulVS dst (LoadVector mem)));
7531   effect(TEMP src);
7532   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7533   ins_encode %{
7534     int vector_len = 0;
7535     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7536   %}
7537   ins_pipe( pipe_slow );
7538 %}
7539 
7540 instruct vmul4S(vecD dst, vecD src) %{
7541   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7542   match(Set dst (MulVS dst src));
7543   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7544   ins_encode %{
7545     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7546   %}
7547   ins_pipe( pipe_slow );
7548 %}
7549 
7550 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7551   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7552   match(Set dst (MulVS src1 src2));
7553   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7554   ins_encode %{
7555     int vector_len = 0;
7556     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7557   %}
7558   ins_pipe( pipe_slow );
7559 %}
7560 
7561 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7562   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7563   match(Set dst (MulVS src1 src2));
7564   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7565   ins_encode %{
7566     int vector_len = 0;
7567     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7568   %}
7569   ins_pipe( pipe_slow );
7570 %}
7571 
7572 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7573   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7574   match(Set dst (MulVS dst src2));
7575   effect(TEMP src1);
7576   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7577   ins_encode %{
7578     int vector_len = 0;
7579     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7580   %}
7581   ins_pipe( pipe_slow );
7582 %}
7583 
7584 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7585   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7586   match(Set dst (MulVS src (LoadVector mem)));
7587   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7588   ins_encode %{
7589     int vector_len = 0;
7590     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7591   %}
7592   ins_pipe( pipe_slow );
7593 %}
7594 
7595 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7596   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7597   match(Set dst (MulVS src (LoadVector mem)));
7598   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7599   ins_encode %{
7600     int vector_len = 0;
7601     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7602   %}
7603   ins_pipe( pipe_slow );
7604 %}
7605 
7606 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7607   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7608   match(Set dst (MulVS dst (LoadVector mem)));
7609   effect(TEMP src);
7610   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7611   ins_encode %{
7612     int vector_len = 0;
7613     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7614   %}
7615   ins_pipe( pipe_slow );
7616 %}
7617 
7618 instruct vmul8S(vecX dst, vecX src) %{
7619   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7620   match(Set dst (MulVS dst src));
7621   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7622   ins_encode %{
7623     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7624   %}
7625   ins_pipe( pipe_slow );
7626 %}
7627 
7628 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7629   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7630   match(Set dst (MulVS src1 src2));
7631   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7632   ins_encode %{
7633     int vector_len = 0;
7634     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7635   %}
7636   ins_pipe( pipe_slow );
7637 %}
7638 
7639 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7640   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7641   match(Set dst (MulVS src1 src2));
7642   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7643   ins_encode %{
7644     int vector_len = 0;
7645     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7646   %}
7647   ins_pipe( pipe_slow );
7648 %}
7649 
7650 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7651   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7652   match(Set dst (MulVS dst src2));
7653   effect(TEMP src1);
7654   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7655   ins_encode %{
7656     int vector_len = 0;
7657     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7658   %}
7659   ins_pipe( pipe_slow );
7660 %}
7661 
7662 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7663   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7664   match(Set dst (MulVS src (LoadVector mem)));
7665   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7666   ins_encode %{
7667     int vector_len = 0;
7668     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7669   %}
7670   ins_pipe( pipe_slow );
7671 %}
7672 
7673 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7674   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7675   match(Set dst (MulVS src (LoadVector mem)));
7676   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7677   ins_encode %{
7678     int vector_len = 0;
7679     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7680   %}
7681   ins_pipe( pipe_slow );
7682 %}
7683 
7684 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7685   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7686   match(Set dst (MulVS dst (LoadVector mem)));
7687   effect(TEMP src);
7688   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7689   ins_encode %{
7690     int vector_len = 0;
7691     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7692   %}
7693   ins_pipe( pipe_slow );
7694 %}
7695 
7696 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7697   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7698   match(Set dst (MulVS src1 src2));
7699   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7700   ins_encode %{
7701     int vector_len = 1;
7702     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7703   %}
7704   ins_pipe( pipe_slow );
7705 %}
7706 
7707 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7708   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7709   match(Set dst (MulVS src1 src2));
7710   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7711   ins_encode %{
7712     int vector_len = 1;
7713     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7714   %}
7715   ins_pipe( pipe_slow );
7716 %}
7717 
7718 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7719   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7720   match(Set dst (MulVS dst src2));
7721   effect(TEMP src1);
7722   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7723   ins_encode %{
7724     int vector_len = 1;
7725     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7726   %}
7727   ins_pipe( pipe_slow );
7728 %}
7729 
7730 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
7731   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7732   match(Set dst (MulVS src (LoadVector mem)));
7733   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7734   ins_encode %{
7735     int vector_len = 1;
7736     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7737   %}
7738   ins_pipe( pipe_slow );
7739 %}
7740 
7741 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
7742   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7743   match(Set dst (MulVS src (LoadVector mem)));
7744   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7745   ins_encode %{
7746     int vector_len = 1;
7747     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7748   %}
7749   ins_pipe( pipe_slow );
7750 %}
7751 
7752 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7753   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7754   match(Set dst (MulVS dst (LoadVector mem)));
7755   effect(TEMP src);
7756   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7757   ins_encode %{
7758     int vector_len = 1;
7759     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7760   %}
7761   ins_pipe( pipe_slow );
7762 %}
7763 
7764 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7765   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7766   match(Set dst (MulVS src1 src2));
7767   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7768   ins_encode %{
7769     int vector_len = 2;
7770     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7771   %}
7772   ins_pipe( pipe_slow );
7773 %}
7774 
7775 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7776   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7777   match(Set dst (MulVS src (LoadVector mem)));
7778   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7779   ins_encode %{
7780     int vector_len = 2;
7781     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7782   %}
7783   ins_pipe( pipe_slow );
7784 %}
7785 
7786 // Integers vector mul (sse4_1)
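// pmulld (packed 32-bit low multiply) was introduced with SSE4.1, hence the
// UseSSE > 3 guard on the non-AVX rules below.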
7787 instruct vmul2I(vecD dst, vecD src) %{
7788   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7789   match(Set dst (MulVI dst src));
7790   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7791   ins_encode %{
7792     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7793   %}
7794   ins_pipe( pipe_slow );
7795 %}
7796 
7797 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7798   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7799   match(Set dst (MulVI src1 src2));
7800   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7801   ins_encode %{
7802     int vector_len = 0;
7803     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7804   %}
7805   ins_pipe( pipe_slow );
7806 %}
7807 
7808 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7809   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7810   match(Set dst (MulVI src (LoadVector mem)));
7811   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7812   ins_encode %{
7813     int vector_len = 0;
7814     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7815   %}
7816   ins_pipe( pipe_slow );
7817 %}
7818 
7819 instruct vmul4I(vecX dst, vecX src) %{
7820   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7821   match(Set dst (MulVI dst src));
7822   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7823   ins_encode %{
7824     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7825   %}
7826   ins_pipe( pipe_slow );
7827 %}
7828 
7829 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7830   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7831   match(Set dst (MulVI src1 src2));
7832   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7833   ins_encode %{
7834     int vector_len = 0;
7835     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7836   %}
7837   ins_pipe( pipe_slow );
7838 %}
7839 
7840 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7841   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7842   match(Set dst (MulVI src (LoadVector mem)));
7843   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7844   ins_encode %{
7845     int vector_len = 0;
7846     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7847   %}
7848   ins_pipe( pipe_slow );
7849 %}
7850 
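// Longs vector mul
// vpmullq is an AVX-512DQ instruction, so every packed-long multiply rule also
// checks VM_Version::supports_avx512dq().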
7851 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7852   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7853   match(Set dst (MulVL src1 src2));
7854   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7855   ins_encode %{
7856     int vector_len = 0;
7857     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7858   %}
7859   ins_pipe( pipe_slow );
7860 %}
7861 
7862 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7863   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7864   match(Set dst (MulVL src (LoadVector mem)));
7865   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7866   ins_encode %{
7867     int vector_len = 0;
7868     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7869   %}
7870   ins_pipe( pipe_slow );
7871 %}
7872 
7873 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7874   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7875   match(Set dst (MulVL src1 src2));
7876   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7877   ins_encode %{
7878     int vector_len = 1;
7879     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7880   %}
7881   ins_pipe( pipe_slow );
7882 %}
7883 
7884 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7885   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7886   match(Set dst (MulVL src (LoadVector mem)));
7887   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7888   ins_encode %{
7889     int vector_len = 1;
7890     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7891   %}
7892   ins_pipe( pipe_slow );
7893 %}
7894 
7895 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7896   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7897   match(Set dst (MulVL src1 src2));
7898   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7899   ins_encode %{
7900     int vector_len = 2;
7901     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7902   %}
7903   ins_pipe( pipe_slow );
7904 %}
7905 
7906 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7907   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7908   match(Set dst (MulVL src (LoadVector mem)));
7909   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7910   ins_encode %{
7911     int vector_len = 2;
7912     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7913   %}
7914   ins_pipe( pipe_slow );
7915 %}
7916 
7917 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7918   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7919   match(Set dst (MulVI src1 src2));
7920   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7921   ins_encode %{
7922     int vector_len = 1;
7923     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7924   %}
7925   ins_pipe( pipe_slow );
7926 %}
7927 
7928 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7929   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7930   match(Set dst (MulVI src (LoadVector mem)));
7931   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7932   ins_encode %{
7933     int vector_len = 1;
7934     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7935   %}
7936   ins_pipe( pipe_slow );
7937 %}
7938 
7939 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7940   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7941   match(Set dst (MulVI src1 src2));
7942   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7943   ins_encode %{
7944     int vector_len = 2;
7945     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7946   %}
7947   ins_pipe( pipe_slow );
7948 %}
7949 
7950 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7951   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7952   match(Set dst (MulVI src (LoadVector mem)));
7953   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7954   ins_encode %{
7955     int vector_len = 2;
7956     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7957   %}
7958   ins_pipe( pipe_slow );
7959 %}
7960 
7961 // Floats vector mul
7962 instruct vmul2F(vecD dst, vecD src) %{
7963   predicate(n->as_Vector()->length() == 2);
7964   match(Set dst (MulVF dst src));
7965   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7966   ins_encode %{
7967     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7968   %}
7969   ins_pipe( pipe_slow );
7970 %}
7971 
7972 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7973   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7974   match(Set dst (MulVF src1 src2));
7975   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7976   ins_encode %{
7977     int vector_len = 0;
7978     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7979   %}
7980   ins_pipe( pipe_slow );
7981 %}
7982 
7983 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7984   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7985   match(Set dst (MulVF src (LoadVector mem)));
7986   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7987   ins_encode %{
7988     int vector_len = 0;
7989     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7990   %}
7991   ins_pipe( pipe_slow );
7992 %}
7993 
7994 instruct vmul4F(vecX dst, vecX src) %{
7995   predicate(n->as_Vector()->length() == 4);
7996   match(Set dst (MulVF dst src));
7997   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7998   ins_encode %{
7999     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8000   %}
8001   ins_pipe( pipe_slow );
8002 %}
8003 
8004 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8005   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8006   match(Set dst (MulVF src1 src2));
8007   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8008   ins_encode %{
8009     int vector_len = 0;
8010     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8011   %}
8012   ins_pipe( pipe_slow );
8013 %}
8014 
8015 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8016   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8017   match(Set dst (MulVF src (LoadVector mem)));
8018   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8019   ins_encode %{
8020     int vector_len = 0;
8021     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8022   %}
8023   ins_pipe( pipe_slow );
8024 %}
8025 
8026 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8027   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8028   match(Set dst (MulVF src1 src2));
8029   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8030   ins_encode %{
8031     int vector_len = 1;
8032     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8033   %}
8034   ins_pipe( pipe_slow );
8035 %}
8036 
8037 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8038   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8039   match(Set dst (MulVF src (LoadVector mem)));
8040   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8041   ins_encode %{
8042     int vector_len = 1;
8043     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8044   %}
8045   ins_pipe( pipe_slow );
8046 %}
8047 
8048 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8049   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8050   match(Set dst (MulVF src1 src2));
8051   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8052   ins_encode %{
8053     int vector_len = 2;
8054     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8055   %}
8056   ins_pipe( pipe_slow );
8057 %}
8058 
8059 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8060   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8061   match(Set dst (MulVF src (LoadVector mem)));
8062   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8063   ins_encode %{
8064     int vector_len = 2;
8065     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8066   %}
8067   ins_pipe( pipe_slow );
8068 %}
8069 
8070 // Doubles vector mul
8071 instruct vmul2D(vecX dst, vecX src) %{
8072   predicate(n->as_Vector()->length() == 2);
8073   match(Set dst (MulVD dst src));
8074   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8075   ins_encode %{
8076     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8077   %}
8078   ins_pipe( pipe_slow );
8079 %}
8080 
8081 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8082   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8083   match(Set dst (MulVD src1 src2));
8084   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8085   ins_encode %{
8086     int vector_len = 0;
8087     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8088   %}
8089   ins_pipe( pipe_slow );
8090 %}
8091 
8092 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8093   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8094   match(Set dst (MulVD src (LoadVector mem)));
8095   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8096   ins_encode %{
8097     int vector_len = 0;
8098     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8099   %}
8100   ins_pipe( pipe_slow );
8101 %}
8102 
8103 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8104   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8105   match(Set dst (MulVD src1 src2));
8106   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8107   ins_encode %{
8108     int vector_len = 1;
8109     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8110   %}
8111   ins_pipe( pipe_slow );
8112 %}
8113 
8114 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8115   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8116   match(Set dst (MulVD src (LoadVector mem)));
8117   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8118   ins_encode %{
8119     int vector_len = 1;
8120     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8121   %}
8122   ins_pipe( pipe_slow );
8123 %}
8124 
8125 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8126   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8127   match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
8129   ins_encode %{
8130     int vector_len = 2;
8131     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8132   %}
8133   ins_pipe( pipe_slow );
8134 %}
8135 
8136 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8137   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8138   match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
8140   ins_encode %{
8141     int vector_len = 2;
8142     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8143   %}
8144   ins_pipe( pipe_slow );
8145 %}
8146 
8147 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8148   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
8149   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
8150   effect(TEMP dst, USE src1, USE src2);
8151   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
8152             "vpblendd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
8153          %}
8154   ins_encode %{
8155     int vector_len = 1;
8156     int cond = (Assembler::Condition)($copnd$$cmpcode);
8157     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8158     __ vpblendd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8159   %}
8160   ins_pipe( pipe_slow );
8161 %}
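
// As a hedged illustration (plain Java, not matcher code), the compare+blend
// sequence above is the kind of lowering a vectorized double select can use,
// e.g. for a loop such as:
//
//   static void minInto(double[] a, double[] b, double[] c) {
//     for (int i = 0; i < c.length; i++) {
//       c[i] = (a[i] < b[i]) ? a[i] : b[i];   // per-lane compare, then blend
//     }
//   }
//
// Whether C2 really emits CMoveVD for a given loop depends on the usual
// superword heuristics, so treat this only as a sketch of the shape.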
8162 
8163 // --------------------------------- DIV --------------------------------------
8164 
8165 // Floats vector div
8166 instruct vdiv2F(vecD dst, vecD src) %{
8167   predicate(n->as_Vector()->length() == 2);
8168   match(Set dst (DivVF dst src));
8169   format %{ "divps   $dst,$src\t! div packed2F" %}
8170   ins_encode %{
8171     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8172   %}
8173   ins_pipe( pipe_slow );
8174 %}
8175 
8176 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
8177   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8178   match(Set dst (DivVF src1 src2));
8179   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8180   ins_encode %{
8181     int vector_len = 0;
8182     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8183   %}
8184   ins_pipe( pipe_slow );
8185 %}
8186 
8187 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8188   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8189   match(Set dst (DivVF src (LoadVector mem)));
8190   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8191   ins_encode %{
8192     int vector_len = 0;
8193     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8194   %}
8195   ins_pipe( pipe_slow );
8196 %}
8197 
8198 instruct vdiv4F(vecX dst, vecX src) %{
8199   predicate(n->as_Vector()->length() == 4);
8200   match(Set dst (DivVF dst src));
8201   format %{ "divps   $dst,$src\t! div packed4F" %}
8202   ins_encode %{
8203     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8204   %}
8205   ins_pipe( pipe_slow );
8206 %}
8207 
8208 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8209   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8210   match(Set dst (DivVF src1 src2));
8211   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8212   ins_encode %{
8213     int vector_len = 0;
8214     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8215   %}
8216   ins_pipe( pipe_slow );
8217 %}
8218 
8219 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8220   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8221   match(Set dst (DivVF src (LoadVector mem)));
8222   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8223   ins_encode %{
8224     int vector_len = 0;
8225     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8226   %}
8227   ins_pipe( pipe_slow );
8228 %}
8229 
8230 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8231   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8232   match(Set dst (DivVF src1 src2));
8233   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8234   ins_encode %{
8235     int vector_len = 1;
8236     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8237   %}
8238   ins_pipe( pipe_slow );
8239 %}
8240 
8241 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8242   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8243   match(Set dst (DivVF src (LoadVector mem)));
8244   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8245   ins_encode %{
8246     int vector_len = 1;
8247     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8248   %}
8249   ins_pipe( pipe_slow );
8250 %}
8251 
8252 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8254   match(Set dst (DivVF src1 src2));
8255   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8256   ins_encode %{
8257     int vector_len = 2;
8258     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8259   %}
8260   ins_pipe( pipe_slow );
8261 %}
8262 
8263 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8265   match(Set dst (DivVF src (LoadVector mem)));
8266   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8267   ins_encode %{
8268     int vector_len = 2;
8269     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8270   %}
8271   ins_pipe( pipe_slow );
8272 %}
8273 
8274 // Doubles vector div
8275 instruct vdiv2D(vecX dst, vecX src) %{
8276   predicate(n->as_Vector()->length() == 2);
8277   match(Set dst (DivVD dst src));
8278   format %{ "divpd   $dst,$src\t! div packed2D" %}
8279   ins_encode %{
8280     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8281   %}
8282   ins_pipe( pipe_slow );
8283 %}
8284 
8285 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8286   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8287   match(Set dst (DivVD src1 src2));
8288   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8289   ins_encode %{
8290     int vector_len = 0;
8291     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8292   %}
8293   ins_pipe( pipe_slow );
8294 %}
8295 
8296 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8297   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8298   match(Set dst (DivVD src (LoadVector mem)));
8299   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8300   ins_encode %{
8301     int vector_len = 0;
8302     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8303   %}
8304   ins_pipe( pipe_slow );
8305 %}
8306 
8307 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8308   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8309   match(Set dst (DivVD src1 src2));
8310   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8311   ins_encode %{
8312     int vector_len = 1;
8313     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8314   %}
8315   ins_pipe( pipe_slow );
8316 %}
8317 
8318 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8319   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8320   match(Set dst (DivVD src (LoadVector mem)));
8321   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8322   ins_encode %{
8323     int vector_len = 1;
8324     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8325   %}
8326   ins_pipe( pipe_slow );
8327 %}
8328 
8329 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8330   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8331   match(Set dst (DivVD src1 src2));
8332   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8333   ins_encode %{
8334     int vector_len = 2;
8335     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8336   %}
8337   ins_pipe( pipe_slow );
8338 %}
8339 
8340 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8341   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8342   match(Set dst (DivVD src (LoadVector mem)));
8343   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8344   ins_encode %{
8345     int vector_len = 2;
8346     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8347   %}
8348   ins_pipe( pipe_slow );
8349 %}
8350 
8351 // ------------------------------ Shift ---------------------------------------
8352 
8353 // Left and right shift count vectors are the same on x86
8354 // (only lowest bits of xmm reg are used for count).
8355 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8356   match(Set dst (LShiftCntV cnt));
8357   match(Set dst (RShiftCntV cnt));
8358   format %{ "movd    $dst,$cnt\t! load shift count" %}
8359   ins_encode %{
8360     __ movdl($dst$$XMMRegister, $cnt$$Register);
8361   %}
8362   ins_pipe( pipe_slow );
8363 %}
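
// As a hedged illustration (plain Java, not matcher code), the shift-count
// register above is what a loop with a loop-invariant, non-constant shift
// amount needs: the count is moved into an XMM register once and then reused
// by the packed shifts that follow, e.g.
//
//   static void shiftAll(int[] a, int s) {
//     for (int i = 0; i < a.length; i++) {
//       a[i] = a[i] << s;   // variable count s feeds LShiftCntV, then LShiftVI
//     }
//   }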
8364 
8365 // --------------------------------- Sqrt --------------------------------------
8366 
8367 // Floating point vector sqrt - double precision only
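//
// As a hedged illustration (plain Java, not matcher code), SqrtVD typically
// comes from an auto-vectorized Math.sqrt loop over doubles:
//
//   static void sqrtAll(double[] src, double[] dst) {
//     for (int i = 0; i < dst.length; i++) {
//       dst[i] = Math.sqrt(src[i]);   // per-lane sqrt when vectorized
//     }
//   }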
8368 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8369   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8370   match(Set dst (SqrtVD src));
8371   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8372   ins_encode %{
8373     int vector_len = 0;
8374     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8375   %}
8376   ins_pipe( pipe_slow );
8377 %}
8378 
8379 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8380   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8381   match(Set dst (SqrtVD (LoadVector mem)));
8382   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8383   ins_encode %{
8384     int vector_len = 0;
8385     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8386   %}
8387   ins_pipe( pipe_slow );
8388 %}
8389 
8390 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8391   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8392   match(Set dst (SqrtVD src));
8393   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8394   ins_encode %{
8395     int vector_len = 1;
8396     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8397   %}
8398   ins_pipe( pipe_slow );
8399 %}
8400 
8401 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8402   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8403   match(Set dst (SqrtVD (LoadVector mem)));
8404   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8405   ins_encode %{
8406     int vector_len = 1;
8407     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8408   %}
8409   ins_pipe( pipe_slow );
8410 %}
8411 
8412 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8413   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8414   match(Set dst (SqrtVD src));
8415   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8416   ins_encode %{
8417     int vector_len = 2;
8418     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8419   %}
8420   ins_pipe( pipe_slow );
8421 %}
8422 
8423 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8424   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8425   match(Set dst (SqrtVD (LoadVector mem)));
8426   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8427   ins_encode %{
8428     int vector_len = 2;
8429     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8430   %}
8431   ins_pipe( pipe_slow );
8432 %}
8433 
8434 // ------------------------------ LeftShift -----------------------------------
8435 
8436 // Shorts/Chars vector left shift
8437 instruct vsll2S(vecS dst, vecS shift) %{
8438   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8439   match(Set dst (LShiftVS dst shift));
8440   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8441   ins_encode %{
8442     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8443   %}
8444   ins_pipe( pipe_slow );
8445 %}
8446 
8447 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8448   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8449   match(Set dst (LShiftVS dst shift));
8450   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8451   ins_encode %{
8452     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8453   %}
8454   ins_pipe( pipe_slow );
8455 %}
8456 
8457 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8458   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8459   match(Set dst (LShiftVS src shift));
8460   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8461   ins_encode %{
8462     int vector_len = 0;
8463     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8464   %}
8465   ins_pipe( pipe_slow );
8466 %}
8467 
8468 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8469   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8470   match(Set dst (LShiftVS src shift));
8471   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8472   ins_encode %{
8473     int vector_len = 0;
8474     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8475   %}
8476   ins_pipe( pipe_slow );
8477 %}
8478 
8479 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8480   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8481   match(Set dst (LShiftVS dst shift));
8482   effect(TEMP src);
8483   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8484   ins_encode %{
8485     int vector_len = 0;
8486     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8487   %}
8488   ins_pipe( pipe_slow );
8489 %}
8490 
8491 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8492   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8493   match(Set dst (LShiftVS src shift));
8494   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8495   ins_encode %{
8496     int vector_len = 0;
8497     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8498   %}
8499   ins_pipe( pipe_slow );
8500 %}
8501 
8502 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8503   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8504   match(Set dst (LShiftVS src shift));
8505   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8506   ins_encode %{
8507     int vector_len = 0;
8508     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8509   %}
8510   ins_pipe( pipe_slow );
8511 %}
8512 
8513 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8514   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8515   match(Set dst (LShiftVS dst shift));
8516   effect(TEMP src);
8517   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8518   ins_encode %{
8519     int vector_len = 0;
8520     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8521   %}
8522   ins_pipe( pipe_slow );
8523 %}
8524 
8525 instruct vsll4S(vecD dst, vecS shift) %{
8526   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8527   match(Set dst (LShiftVS dst shift));
8528   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8529   ins_encode %{
8530     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8531   %}
8532   ins_pipe( pipe_slow );
8533 %}
8534 
8535 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8536   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8537   match(Set dst (LShiftVS dst shift));
8538   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8539   ins_encode %{
8540     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8541   %}
8542   ins_pipe( pipe_slow );
8543 %}
8544 
8545 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8546   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8547   match(Set dst (LShiftVS src shift));
8548   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8549   ins_encode %{
8550     int vector_len = 0;
8551     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8552   %}
8553   ins_pipe( pipe_slow );
8554 %}
8555 
8556 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8557   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8558   match(Set dst (LShiftVS src shift));
8559   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8560   ins_encode %{
8561     int vector_len = 0;
8562     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8563   %}
8564   ins_pipe( pipe_slow );
8565 %}
8566 
8567 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8568   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8569   match(Set dst (LShiftVS dst shift));
8570   effect(TEMP src);
8571   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8572   ins_encode %{
8573     int vector_len = 0;
8574     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8575   %}
8576   ins_pipe( pipe_slow );
8577 %}
8578 
8579 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8580   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8581   match(Set dst (LShiftVS src shift));
8582   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8583   ins_encode %{
8584     int vector_len = 0;
8585     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8586   %}
8587   ins_pipe( pipe_slow );
8588 %}
8589 
8590 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8591   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8592   match(Set dst (LShiftVS src shift));
8593   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8594   ins_encode %{
8595     int vector_len = 0;
8596     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8597   %}
8598   ins_pipe( pipe_slow );
8599 %}
8600 
8601 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8602   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8603   match(Set dst (LShiftVS dst shift));
8604   effect(TEMP src);
8605   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8606   ins_encode %{
8607     int vector_len = 0;
8608     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8609   %}
8610   ins_pipe( pipe_slow );
8611 %}
8612 
8613 instruct vsll8S(vecX dst, vecS shift) %{
8614   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8615   match(Set dst (LShiftVS dst shift));
8616   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8617   ins_encode %{
8618     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8619   %}
8620   ins_pipe( pipe_slow );
8621 %}
8622 
8623 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8624   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8625   match(Set dst (LShiftVS dst shift));
8626   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8627   ins_encode %{
8628     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8629   %}
8630   ins_pipe( pipe_slow );
8631 %}
8632 
8633 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8634   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8635   match(Set dst (LShiftVS src shift));
8636   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8637   ins_encode %{
8638     int vector_len = 0;
8639     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8640   %}
8641   ins_pipe( pipe_slow );
8642 %}
8643 
8644 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8645   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8646   match(Set dst (LShiftVS src shift));
8647   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8648   ins_encode %{
8649     int vector_len = 0;
8650     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8651   %}
8652   ins_pipe( pipe_slow );
8653 %}
8654 
8655 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8656   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8657   match(Set dst (LShiftVS dst shift));
8658   effect(TEMP src);
8659   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8660   ins_encode %{
8661     int vector_len = 0;
8662     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8663   %}
8664   ins_pipe( pipe_slow );
8665 %}
8666 
8667 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8668   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8669   match(Set dst (LShiftVS src shift));
8670   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8671   ins_encode %{
8672     int vector_len = 0;
8673     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8674   %}
8675   ins_pipe( pipe_slow );
8676 %}
8677 
8678 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8679   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8680   match(Set dst (LShiftVS src shift));
8681   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8682   ins_encode %{
8683     int vector_len = 0;
8684     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8685   %}
8686   ins_pipe( pipe_slow );
8687 %}
8688 
8689 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8690   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8691   match(Set dst (LShiftVS dst shift));
8692   effect(TEMP src);
8693   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8694   ins_encode %{
8695     int vector_len = 0;
8696     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
8702   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8703   match(Set dst (LShiftVS src shift));
8704   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8705   ins_encode %{
8706     int vector_len = 1;
8707     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8708   %}
8709   ins_pipe( pipe_slow );
8710 %}
8711 
8712 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
8713   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8714   match(Set dst (LShiftVS src shift));
8715   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8716   ins_encode %{
8717     int vector_len = 1;
8718     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8719   %}
8720   ins_pipe( pipe_slow );
8721 %}
8722 
8723 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
8724   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8725   match(Set dst (LShiftVS dst shift));
8726   effect(TEMP src);
8727   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8728   ins_encode %{
8729     int vector_len = 1;
8730     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8731   %}
8732   ins_pipe( pipe_slow );
8733 %}
8734 
8735 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
8736   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8737   match(Set dst (LShiftVS src shift));
8738   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8739   ins_encode %{
8740     int vector_len = 1;
8741     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8742   %}
8743   ins_pipe( pipe_slow );
8744 %}
8745 
8746 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
8747   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8748   match(Set dst (LShiftVS src shift));
8749   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8750   ins_encode %{
8751     int vector_len = 1;
8752     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8753   %}
8754   ins_pipe( pipe_slow );
8755 %}
8756 
8757 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
8758   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8759   match(Set dst (LShiftVS dst shift));
8760   effect(TEMP src);
8761   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8762   ins_encode %{
8763     int vector_len = 1;
8764     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8765   %}
8766   ins_pipe( pipe_slow );
8767 %}
8768 
8769 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8770   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8771   match(Set dst (LShiftVS src shift));
8772   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8773   ins_encode %{
8774     int vector_len = 2;
8775     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8776   %}
8777   ins_pipe( pipe_slow );
8778 %}
8779 
8780 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8781   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8782   match(Set dst (LShiftVS src shift));
8783   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8784   ins_encode %{
8785     int vector_len = 2;
8786     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8787   %}
8788   ins_pipe( pipe_slow );
8789 %}
8790 
8791 // Integers vector left shift
8792 instruct vsll2I(vecD dst, vecS shift) %{
8793   predicate(n->as_Vector()->length() == 2);
8794   match(Set dst (LShiftVI dst shift));
8795   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8796   ins_encode %{
8797     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8798   %}
8799   ins_pipe( pipe_slow );
8800 %}
8801 
8802 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8803   predicate(n->as_Vector()->length() == 2);
8804   match(Set dst (LShiftVI dst shift));
8805   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8806   ins_encode %{
8807     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8808   %}
8809   ins_pipe( pipe_slow );
8810 %}
8811 
8812 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8813   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8814   match(Set dst (LShiftVI src shift));
8815   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8816   ins_encode %{
8817     int vector_len = 0;
8818     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8819   %}
8820   ins_pipe( pipe_slow );
8821 %}
8822 
8823 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8824   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8825   match(Set dst (LShiftVI src shift));
8826   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8827   ins_encode %{
8828     int vector_len = 0;
8829     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8830   %}
8831   ins_pipe( pipe_slow );
8832 %}
8833 
8834 instruct vsll4I(vecX dst, vecS shift) %{
8835   predicate(n->as_Vector()->length() == 4);
8836   match(Set dst (LShiftVI dst shift));
8837   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8838   ins_encode %{
8839     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8840   %}
8841   ins_pipe( pipe_slow );
8842 %}
8843 
8844 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8845   predicate(n->as_Vector()->length() == 4);
8846   match(Set dst (LShiftVI dst shift));
8847   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8848   ins_encode %{
8849     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8850   %}
8851   ins_pipe( pipe_slow );
8852 %}
8853 
8854 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8855   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8856   match(Set dst (LShiftVI src shift));
8857   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8858   ins_encode %{
8859     int vector_len = 0;
8860     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8861   %}
8862   ins_pipe( pipe_slow );
8863 %}
8864 
8865 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8866   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8867   match(Set dst (LShiftVI src shift));
8868   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8869   ins_encode %{
8870     int vector_len = 0;
8871     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8872   %}
8873   ins_pipe( pipe_slow );
8874 %}
8875 
8876 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
8877   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8878   match(Set dst (LShiftVI src shift));
8879   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8880   ins_encode %{
8881     int vector_len = 1;
8882     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8883   %}
8884   ins_pipe( pipe_slow );
8885 %}
8886 
8887 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8888   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8889   match(Set dst (LShiftVI src shift));
8890   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8891   ins_encode %{
8892     int vector_len = 1;
8893     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8894   %}
8895   ins_pipe( pipe_slow );
8896 %}
8897 
8898 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
8899   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8900   match(Set dst (LShiftVI src shift));
8901   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8902   ins_encode %{
8903     int vector_len = 2;
8904     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8905   %}
8906   ins_pipe( pipe_slow );
8907 %}
8908 
8909 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8910   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8911   match(Set dst (LShiftVI src shift));
8912   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8913   ins_encode %{
8914     int vector_len = 2;
8915     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8916   %}
8917   ins_pipe( pipe_slow );
8918 %}
8919 
8920 // Longs vector left shift
8921 instruct vsll2L(vecX dst, vecS shift) %{
8922   predicate(n->as_Vector()->length() == 2);
8923   match(Set dst (LShiftVL dst shift));
8924   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8925   ins_encode %{
8926     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
8927   %}
8928   ins_pipe( pipe_slow );
8929 %}
8930 
8931 instruct vsll2L_imm(vecX dst, immI8 shift) %{
8932   predicate(n->as_Vector()->length() == 2);
8933   match(Set dst (LShiftVL dst shift));
8934   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8935   ins_encode %{
8936     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
8937   %}
8938   ins_pipe( pipe_slow );
8939 %}
8940 
8941 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
8942   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8943   match(Set dst (LShiftVL src shift));
8944   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8945   ins_encode %{
8946     int vector_len = 0;
8947     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8948   %}
8949   ins_pipe( pipe_slow );
8950 %}
8951 
8952 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8953   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8954   match(Set dst (LShiftVL src shift));
8955   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8956   ins_encode %{
8957     int vector_len = 0;
8958     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8959   %}
8960   ins_pipe( pipe_slow );
8961 %}
8962 
8963 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
8964   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8965   match(Set dst (LShiftVL src shift));
8966   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8967   ins_encode %{
8968     int vector_len = 1;
8969     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8970   %}
8971   ins_pipe( pipe_slow );
8972 %}
8973 
8974 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8975   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8976   match(Set dst (LShiftVL src shift));
8977   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8978   ins_encode %{
8979     int vector_len = 1;
8980     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8981   %}
8982   ins_pipe( pipe_slow );
8983 %}
8984 
8985 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8986   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8987   match(Set dst (LShiftVL src shift));
8988   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8989   ins_encode %{
8990     int vector_len = 2;
8991     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8992   %}
8993   ins_pipe( pipe_slow );
8994 %}
8995 
8996 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8997   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8998   match(Set dst (LShiftVL src shift));
8999   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9000   ins_encode %{
9001     int vector_len = 2;
9002     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9003   %}
9004   ins_pipe( pipe_slow );
9005 %}
9006 
9007 // ----------------------- LogicalRightShift -----------------------------------
9008 
// A logical right shift of a short vector produces an incorrect Java result
// for negative data, because Java code converts short values to int with
// sign extension before shifting. Char vectors are fine, since chars are
// unsigned values.
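//
// As a hedged illustration (plain Java, not matcher code) of the point above:
//
//   short s = (short)0xFFFC;        // -4
//   short j = (short)(s >>> 1);     // Java widens to 0xFFFFFFFC, shifts, narrows: j == -2
//                                   // a packed psrlw lane would give 0x7FFE == 32766 instead
//   char  c = 0xFFFC;
//   char  k = (char)(c >>> 1);      // chars zero-extend, so k == 0x7FFE, matching psrlw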
9013 
9014 instruct vsrl2S(vecS dst, vecS shift) %{
9015   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9016   match(Set dst (URShiftVS dst shift));
9017   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9018   ins_encode %{
9019     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9020   %}
9021   ins_pipe( pipe_slow );
9022 %}
9023 
9024 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9025   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9026   match(Set dst (URShiftVS dst shift));
9027   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9028   ins_encode %{
9029     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9030   %}
9031   ins_pipe( pipe_slow );
9032 %}
9033 
9034 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9035   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9036   match(Set dst (URShiftVS src shift));
9037   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9038   ins_encode %{
9039     int vector_len = 0;
9040     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9041   %}
9042   ins_pipe( pipe_slow );
9043 %}
9044 
9045 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9046   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9047   match(Set dst (URShiftVS src shift));
9048   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9049   ins_encode %{
9050     int vector_len = 0;
9051     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9052   %}
9053   ins_pipe( pipe_slow );
9054 %}
9055 
9056 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9057   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9058   match(Set dst (URShiftVS dst shift));
9059   effect(TEMP src);
9060   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9061   ins_encode %{
9062     int vector_len = 0;
9063     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9064   %}
9065   ins_pipe( pipe_slow );
9066 %}
9067 
9068 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9069   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9070   match(Set dst (URShiftVS src shift));
9071   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9072   ins_encode %{
9073     int vector_len = 0;
9074     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9075   %}
9076   ins_pipe( pipe_slow );
9077 %}
9078 
9079 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9080   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9081   match(Set dst (URShiftVS src shift));
9082   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9083   ins_encode %{
9084     int vector_len = 0;
9085     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9086   %}
9087   ins_pipe( pipe_slow );
9088 %}
9089 
9090 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9091   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9092   match(Set dst (URShiftVS dst shift));
9093   effect(TEMP src);
9094   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9095   ins_encode %{
9096     int vector_len = 0;
9097     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9098   %}
9099   ins_pipe( pipe_slow );
9100 %}
9101 
9102 instruct vsrl4S(vecD dst, vecS shift) %{
9103   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9104   match(Set dst (URShiftVS dst shift));
9105   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9106   ins_encode %{
9107     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9108   %}
9109   ins_pipe( pipe_slow );
9110 %}
9111 
9112 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9113   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9114   match(Set dst (URShiftVS dst shift));
9115   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9116   ins_encode %{
9117     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9118   %}
9119   ins_pipe( pipe_slow );
9120 %}
9121 
9122 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9123   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9124   match(Set dst (URShiftVS src shift));
9125   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9126   ins_encode %{
9127     int vector_len = 0;
9128     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9129   %}
9130   ins_pipe( pipe_slow );
9131 %}
9132 
9133 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9134   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9135   match(Set dst (URShiftVS src shift));
9136   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9137   ins_encode %{
9138     int vector_len = 0;
9139     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9140   %}
9141   ins_pipe( pipe_slow );
9142 %}
9143 
9144 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9145   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9146   match(Set dst (URShiftVS dst shift));
9147   effect(TEMP src);
9148   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9149   ins_encode %{
9150     int vector_len = 0;
9151     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9152   %}
9153   ins_pipe( pipe_slow );
9154 %}
9155 
9156 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9157   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9158   match(Set dst (URShiftVS src shift));
9159   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9160   ins_encode %{
9161     int vector_len = 0;
9162     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9163   %}
9164   ins_pipe( pipe_slow );
9165 %}
9166 
9167 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9168   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9169   match(Set dst (URShiftVS src shift));
9170   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9171   ins_encode %{
9172     int vector_len = 0;
9173     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9174   %}
9175   ins_pipe( pipe_slow );
9176 %}
9177 
9178 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9179   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9180   match(Set dst (URShiftVS dst shift));
9181   effect(TEMP src);
9182   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9183   ins_encode %{
9184     int vector_len = 0;
9185     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9186   %}
9187   ins_pipe( pipe_slow );
9188 %}
9189 
9190 instruct vsrl8S(vecX dst, vecS shift) %{
9191   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9192   match(Set dst (URShiftVS dst shift));
9193   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9194   ins_encode %{
9195     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9196   %}
9197   ins_pipe( pipe_slow );
9198 %}
9199 
9200 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9201   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9202   match(Set dst (URShiftVS dst shift));
9203   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9204   ins_encode %{
9205     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9206   %}
9207   ins_pipe( pipe_slow );
9208 %}
9209 
9210 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9211   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9212   match(Set dst (URShiftVS src shift));
9213   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9214   ins_encode %{
9215     int vector_len = 0;
9216     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9217   %}
9218   ins_pipe( pipe_slow );
9219 %}
9220 
9221 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9222   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9223   match(Set dst (URShiftVS src shift));
9224   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9225   ins_encode %{
9226     int vector_len = 0;
9227     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9228   %}
9229   ins_pipe( pipe_slow );
9230 %}
9231 
9232 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9233   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9234   match(Set dst (URShiftVS dst shift));
9235   effect(TEMP src);
9236   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9237   ins_encode %{
9238     int vector_len = 0;
9239     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9240   %}
9241   ins_pipe( pipe_slow );
9242 %}
9243 
9244 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9245   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9246   match(Set dst (URShiftVS src shift));
9247   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9248   ins_encode %{
9249     int vector_len = 0;
9250     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9251   %}
9252   ins_pipe( pipe_slow );
9253 %}
9254 
9255 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9256   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9257   match(Set dst (URShiftVS src shift));
9258   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9259   ins_encode %{
9260     int vector_len = 0;
9261     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9262   %}
9263   ins_pipe( pipe_slow );
9264 %}
9265 
9266 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9267   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9268   match(Set dst (URShiftVS dst shift));
9269   effect(TEMP src);
9270   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9271   ins_encode %{
9272     int vector_len = 0;
9273     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9274   %}
9275   ins_pipe( pipe_slow );
9276 %}
9277 
9278 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9279   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9280   match(Set dst (URShiftVS src shift));
9281   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9282   ins_encode %{
9283     int vector_len = 1;
9284     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9285   %}
9286   ins_pipe( pipe_slow );
9287 %}
9288 
9289 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9290   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9291   match(Set dst (URShiftVS src shift));
9292   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9293   ins_encode %{
9294     int vector_len = 1;
9295     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9296   %}
9297   ins_pipe( pipe_slow );
9298 %}
9299 
9300 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9301   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9302   match(Set dst (URShiftVS dst shift));
9303   effect(TEMP src);
9304   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9305   ins_encode %{
9306     int vector_len = 1;
9307     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9313   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9314   match(Set dst (URShiftVS src shift));
9315   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9316   ins_encode %{
9317     int vector_len = 1;
9318     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9319   %}
9320   ins_pipe( pipe_slow );
9321 %}
9322 
9323 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9324   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9325   match(Set dst (URShiftVS src shift));
9326   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9327   ins_encode %{
9328     int vector_len = 1;
9329     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9330   %}
9331   ins_pipe( pipe_slow );
9332 %}
9333 
9334 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9335   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9336   match(Set dst (URShiftVS dst shift));
9337   effect(TEMP src);
9338   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9339   ins_encode %{
9340     int vector_len = 1;
9341     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9342   %}
9343   ins_pipe( pipe_slow );
9344 %}
9345 
9346 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9347   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9348   match(Set dst (URShiftVS src shift));
9349   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9350   ins_encode %{
9351     int vector_len = 2;
9352     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9353   %}
9354   ins_pipe( pipe_slow );
9355 %}
9356 
9357 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9358   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9359   match(Set dst (URShiftVS src shift));
9360   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9361   ins_encode %{
9362     int vector_len = 2;
9363     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9364   %}
9365   ins_pipe( pipe_slow );
9366 %}
9367 
9368 // Integers vector logical right shift
9369 instruct vsrl2I(vecD dst, vecS shift) %{
9370   predicate(n->as_Vector()->length() == 2);
9371   match(Set dst (URShiftVI dst shift));
9372   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9373   ins_encode %{
9374     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9375   %}
9376   ins_pipe( pipe_slow );
9377 %}
9378 
9379 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
9380   predicate(n->as_Vector()->length() == 2);
9381   match(Set dst (URShiftVI dst shift));
9382   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9383   ins_encode %{
9384     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9385   %}
9386   ins_pipe( pipe_slow );
9387 %}
9388 
9389 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
9390   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9391   match(Set dst (URShiftVI src shift));
9392   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9393   ins_encode %{
9394     int vector_len = 0;
9395     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9396   %}
9397   ins_pipe( pipe_slow );
9398 %}
9399 
9400 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9401   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9402   match(Set dst (URShiftVI src shift));
9403   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9404   ins_encode %{
9405     int vector_len = 0;
9406     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9407   %}
9408   ins_pipe( pipe_slow );
9409 %}
9410 
9411 instruct vsrl4I(vecX dst, vecS shift) %{
9412   predicate(n->as_Vector()->length() == 4);
9413   match(Set dst (URShiftVI dst shift));
9414   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9415   ins_encode %{
9416     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9417   %}
9418   ins_pipe( pipe_slow );
9419 %}
9420 
9421 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
9422   predicate(n->as_Vector()->length() == 4);
9423   match(Set dst (URShiftVI dst shift));
9424   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9425   ins_encode %{
9426     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9427   %}
9428   ins_pipe( pipe_slow );
9429 %}
9430 
9431 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
9432   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9433   match(Set dst (URShiftVI src shift));
9434   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9435   ins_encode %{
9436     int vector_len = 0;
9437     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9438   %}
9439   ins_pipe( pipe_slow );
9440 %}
9441 
9442 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9443   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9444   match(Set dst (URShiftVI src shift));
9445   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9446   ins_encode %{
9447     int vector_len = 0;
9448     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9449   %}
9450   ins_pipe( pipe_slow );
9451 %}
9452 
9453 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
9454   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9455   match(Set dst (URShiftVI src shift));
9456   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9457   ins_encode %{
9458     int vector_len = 1;
9459     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9460   %}
9461   ins_pipe( pipe_slow );
9462 %}
9463 
9464 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9465   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9466   match(Set dst (URShiftVI src shift));
9467   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9468   ins_encode %{
9469     int vector_len = 1;
9470     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9471   %}
9472   ins_pipe( pipe_slow );
9473 %}
9474 
9475 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
9476   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9477   match(Set dst (URShiftVI src shift));
9478   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9479   ins_encode %{
9480     int vector_len = 2;
9481     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9482   %}
9483   ins_pipe( pipe_slow );
9484 %}
9485 
9486 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9487   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9488   match(Set dst (URShiftVI src shift));
9489   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9490   ins_encode %{
9491     int vector_len = 2;
9492     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9493   %}
9494   ins_pipe( pipe_slow );
9495 %}
9496 
9497 // Longs vector logical right shift
9498 instruct vsrl2L(vecX dst, vecS shift) %{
9499   predicate(n->as_Vector()->length() == 2);
9500   match(Set dst (URShiftVL dst shift));
9501   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9502   ins_encode %{
9503     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
9504   %}
9505   ins_pipe( pipe_slow );
9506 %}
9507 
9508 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9509   predicate(n->as_Vector()->length() == 2);
9510   match(Set dst (URShiftVL dst shift));
9511   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9512   ins_encode %{
9513     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9514   %}
9515   ins_pipe( pipe_slow );
9516 %}
9517 
9518 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9519   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9520   match(Set dst (URShiftVL src shift));
9521   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9522   ins_encode %{
9523     int vector_len = 0;
9524     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9525   %}
9526   ins_pipe( pipe_slow );
9527 %}
9528 
9529 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9530   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9531   match(Set dst (URShiftVL src shift));
9532   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9533   ins_encode %{
9534     int vector_len = 0;
9535     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9536   %}
9537   ins_pipe( pipe_slow );
9538 %}
9539 
9540 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9541   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9542   match(Set dst (URShiftVL src shift));
9543   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9544   ins_encode %{
9545     int vector_len = 1;
9546     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9547   %}
9548   ins_pipe( pipe_slow );
9549 %}
9550 
9551 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9552   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9553   match(Set dst (URShiftVL src shift));
9554   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9555   ins_encode %{
9556     int vector_len = 1;
9557     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9558   %}
9559   ins_pipe( pipe_slow );
9560 %}
9561 
9562 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9563   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9564   match(Set dst (URShiftVL src shift));
9565   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9566   ins_encode %{
9567     int vector_len = 2;
9568     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9569   %}
9570   ins_pipe( pipe_slow );
9571 %}
9572 
9573 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9574   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9575   match(Set dst (URShiftVL src shift));
9576   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9577   ins_encode %{
9578     int vector_len = 2;
9579     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9580   %}
9581   ins_pipe( pipe_slow );
9582 %}
9583 
9584 // ------------------- ArithmeticRightShift -----------------------------------
9585 
9586 // Shorts/Chars vector arithmetic right shift
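// The short/char rules come in three flavors: the *_avx rules apply on
// AVX/AVX2-only CPUs, the *_evex rules apply when AVX512BW is available, and
// the *_evex_special rules cover EVEX-capable CPUs without AVX512BW, where the
// rule matches the in-place form (dst is both source and destination) and
// carries a TEMP src operand.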
9587 instruct vsra2S(vecS dst, vecS shift) %{
9588   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9589   match(Set dst (RShiftVS dst shift));
9590   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9591   ins_encode %{
9592     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9593   %}
9594   ins_pipe( pipe_slow );
9595 %}
9596 
9597 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9598   predicate(n->as_Vector()->length() == 2);
9599   match(Set dst (RShiftVS dst shift));
9600   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9601   ins_encode %{
9602     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9603   %}
9604   ins_pipe( pipe_slow );
9605 %}
9606 
9607 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9608   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9609   match(Set dst (RShiftVS src shift));
9610   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9611   ins_encode %{
9612     int vector_len = 0;
9613     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9614   %}
9615   ins_pipe( pipe_slow );
9616 %}
9617 
9618 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9619   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9620   match(Set dst (RShiftVS src shift));
9621   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9622   ins_encode %{
9623     int vector_len = 0;
9624     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9625   %}
9626   ins_pipe( pipe_slow );
9627 %}
9628 
9629 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9630   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9631   match(Set dst (RShiftVS dst shift));
9632   effect(TEMP src);
9633   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9634   ins_encode %{
9635     int vector_len = 0;
9636     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9637   %}
9638   ins_pipe( pipe_slow );
9639 %}
9640 
9641 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9642   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9643   match(Set dst (RShiftVS src shift));
9644   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9645   ins_encode %{
9646     int vector_len = 0;
9647     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9648   %}
9649   ins_pipe( pipe_slow );
9650 %}
9651 
9652 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9653   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9654   match(Set dst (RShiftVS src shift));
9655   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9656   ins_encode %{
9657     int vector_len = 0;
9658     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9659   %}
9660   ins_pipe( pipe_slow );
9661 %}
9662 
9663 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9664   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9665   match(Set dst (RShiftVS dst shift));
9666   effect(TEMP src);
9667   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9668   ins_encode %{
9669     int vector_len = 0;
9670     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9671   %}
9672   ins_pipe( pipe_slow );
9673 %}
9674 
9675 instruct vsra4S(vecD dst, vecS shift) %{
9676   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9677   match(Set dst (RShiftVS dst shift));
9678   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9679   ins_encode %{
9680     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9681   %}
9682   ins_pipe( pipe_slow );
9683 %}
9684 
9685 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9686   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9687   match(Set dst (RShiftVS dst shift));
9688   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9689   ins_encode %{
9690     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9691   %}
9692   ins_pipe( pipe_slow );
9693 %}
9694 
9695 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9696   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9697   match(Set dst (RShiftVS src shift));
9698   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9699   ins_encode %{
9700     int vector_len = 0;
9701     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9702   %}
9703   ins_pipe( pipe_slow );
9704 %}
9705 
9706 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9707   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9708   match(Set dst (RShiftVS src shift));
9709   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9710   ins_encode %{
9711     int vector_len = 0;
9712     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9713   %}
9714   ins_pipe( pipe_slow );
9715 %}
9716 
9717 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9718   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9719   match(Set dst (RShiftVS dst shift));
9720   effect(TEMP src);
9721   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9722   ins_encode %{
9723     int vector_len = 0;
9724     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9725   %}
9726   ins_pipe( pipe_slow );
9727 %}
9728 
9729 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9730   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9731   match(Set dst (RShiftVS src shift));
9732   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9733   ins_encode %{
9734     int vector_len = 0;
9735     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9736   %}
9737   ins_pipe( pipe_slow );
9738 %}
9739 
9740 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9741   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9742   match(Set dst (RShiftVS src shift));
9743   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9744   ins_encode %{
9745     int vector_len = 0;
9746     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9747   %}
9748   ins_pipe( pipe_slow );
9749 %}
9750 
9751 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9752   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9753   match(Set dst (RShiftVS dst shift));
9754   effect(TEMP src);
9755   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9756   ins_encode %{
9757     int vector_len = 0;
9758     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9759   %}
9760   ins_pipe( pipe_slow );
9761 %}
9762 
9763 instruct vsra8S(vecX dst, vecS shift) %{
9764   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9765   match(Set dst (RShiftVS dst shift));
9766   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9767   ins_encode %{
9768     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9769   %}
9770   ins_pipe( pipe_slow );
9771 %}
9772 
9773 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9774   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9775   match(Set dst (RShiftVS dst shift));
9776   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9777   ins_encode %{
9778     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9779   %}
9780   ins_pipe( pipe_slow );
9781 %}
9782 
9783 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9784   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9785   match(Set dst (RShiftVS src shift));
9786   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9787   ins_encode %{
9788     int vector_len = 0;
9789     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9790   %}
9791   ins_pipe( pipe_slow );
9792 %}
9793 
9794 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9795   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9796   match(Set dst (RShiftVS src shift));
9797   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9798   ins_encode %{
9799     int vector_len = 0;
9800     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9801   %}
9802   ins_pipe( pipe_slow );
9803 %}
9804 
9805 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9806   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9807   match(Set dst (RShiftVS dst shift));
9808   effect(TEMP src);
9809   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9810   ins_encode %{
9811     int vector_len = 0;
9812     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9813   %}
9814   ins_pipe( pipe_slow );
9815 %}
9816 
9817 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9818   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9819   match(Set dst (RShiftVS src shift));
9820   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9821   ins_encode %{
9822     int vector_len = 0;
9823     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9824   %}
9825   ins_pipe( pipe_slow );
9826 %}
9827 
9828 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9829   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9830   match(Set dst (RShiftVS src shift));
9831   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9832   ins_encode %{
9833     int vector_len = 0;
9834     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9835   %}
9836   ins_pipe( pipe_slow );
9837 %}
9838 
9839 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9840   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9841   match(Set dst (RShiftVS dst shift));
9842   effect(TEMP src);
9843   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9844   ins_encode %{
9845     int vector_len = 0;
9846     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9847   %}
9848   ins_pipe( pipe_slow );
9849 %}
9850 
9851 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9852   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9853   match(Set dst (RShiftVS src shift));
9854   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9855   ins_encode %{
9856     int vector_len = 1;
9857     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9858   %}
9859   ins_pipe( pipe_slow );
9860 %}
9861 
9862 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9863   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9864   match(Set dst (RShiftVS src shift));
9865   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9866   ins_encode %{
9867     int vector_len = 1;
9868     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9869   %}
9870   ins_pipe( pipe_slow );
9871 %}
9872 
9873 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9874   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9875   match(Set dst (RShiftVS dst shift));
9876   effect(TEMP src);
9877   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9878   ins_encode %{
9879     int vector_len = 1;
9880     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9881   %}
9882   ins_pipe( pipe_slow );
9883 %}
9884 
9885 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9886   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9887   match(Set dst (RShiftVS src shift));
9888   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9889   ins_encode %{
9890     int vector_len = 1;
9891     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9892   %}
9893   ins_pipe( pipe_slow );
9894 %}
9895 
9896 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9897   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9898   match(Set dst (RShiftVS src shift));
9899   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9900   ins_encode %{
9901     int vector_len = 1;
9902     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9903   %}
9904   ins_pipe( pipe_slow );
9905 %}
9906 
9907 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9908   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9909   match(Set dst (RShiftVS dst shift));
9910   effect(TEMP src);
9911   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9912   ins_encode %{
9913     int vector_len = 1;
9914     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9915   %}
9916   ins_pipe( pipe_slow );
9917 %}
9918 
9919 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
9920   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9921   match(Set dst (RShiftVS src shift));
9922   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9923   ins_encode %{
9924     int vector_len = 2;
9925     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9926   %}
9927   ins_pipe( pipe_slow );
9928 %}
9929 
9930 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9931   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9932   match(Set dst (RShiftVS src shift));
9933   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9934   ins_encode %{
9935     int vector_len = 2;
9936     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9937   %}
9938   ins_pipe( pipe_slow );
9939 %}
9940 
9941 // Integers vector arithmetic right shift
9942 instruct vsra2I(vecD dst, vecS shift) %{
9943   predicate(n->as_Vector()->length() == 2);
9944   match(Set dst (RShiftVI dst shift));
9945   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9946   ins_encode %{
9947     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9948   %}
9949   ins_pipe( pipe_slow );
9950 %}
9951 
9952 instruct vsra2I_imm(vecD dst, immI8 shift) %{
9953   predicate(n->as_Vector()->length() == 2);
9954   match(Set dst (RShiftVI dst shift));
9955   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9956   ins_encode %{
9957     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9958   %}
9959   ins_pipe( pipe_slow );
9960 %}
9961 
9962 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
9963   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9964   match(Set dst (RShiftVI src shift));
9965   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9966   ins_encode %{
9967     int vector_len = 0;
9968     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9969   %}
9970   ins_pipe( pipe_slow );
9971 %}
9972 
9973 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9974   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9975   match(Set dst (RShiftVI src shift));
9976   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9977   ins_encode %{
9978     int vector_len = 0;
9979     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9980   %}
9981   ins_pipe( pipe_slow );
9982 %}
9983 
9984 instruct vsra4I(vecX dst, vecS shift) %{
9985   predicate(n->as_Vector()->length() == 4);
9986   match(Set dst (RShiftVI dst shift));
9987   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9988   ins_encode %{
9989     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9990   %}
9991   ins_pipe( pipe_slow );
9992 %}
9993 
9994 instruct vsra4I_imm(vecX dst, immI8 shift) %{
9995   predicate(n->as_Vector()->length() == 4);
9996   match(Set dst (RShiftVI dst shift));
9997   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9998   ins_encode %{
9999     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10000   %}
10001   ins_pipe( pipe_slow );
10002 %}
10003 
10004 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
10005   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10006   match(Set dst (RShiftVI src shift));
10007   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10008   ins_encode %{
10009     int vector_len = 0;
10010     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10011   %}
10012   ins_pipe( pipe_slow );
10013 %}
10014 
10015 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10016   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10017   match(Set dst (RShiftVI src shift));
10018   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10019   ins_encode %{
10020     int vector_len = 0;
10021     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10022   %}
10023   ins_pipe( pipe_slow );
10024 %}
10025 
10026 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
10027   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10028   match(Set dst (RShiftVI src shift));
10029   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10030   ins_encode %{
10031     int vector_len = 1;
10032     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10033   %}
10034   ins_pipe( pipe_slow );
10035 %}
10036 
10037 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10038   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10039   match(Set dst (RShiftVI src shift));
10040   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10041   ins_encode %{
10042     int vector_len = 1;
10043     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10044   %}
10045   ins_pipe( pipe_slow );
10046 %}
10047 
10048 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
10049   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10050   match(Set dst (RShiftVI src shift));
10051   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10052   ins_encode %{
10053     int vector_len = 2;
10054     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10055   %}
10056   ins_pipe( pipe_slow );
10057 %}
10058 
10059 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10060   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10061   match(Set dst (RShiftVI src shift));
10062   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10063   ins_encode %{
10064     int vector_len = 2;
10065     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10066   %}
10067   ins_pipe( pipe_slow );
10068 %}
10069 
10070 // There are no vector arithmetic right shift instructions for longs.
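// (SSE/AVX2 only provide packed arithmetic right shifts for 16- and 32-bit
// elements; a 64-bit form, VPSRAQ, exists only with AVX-512.)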
10071 
10072 
10073 // --------------------------------- AND --------------------------------------
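// The AND/OR/XOR sections share one shape: a two-operand SSE rule that
// operates on $dst in place, an AVX three-operand register-register rule,
// and an AVX rule that folds a LoadVector memory operand into the instruction.
// The 32-byte forms require AVX2 (UseAVX > 1) and the 64-byte forms AVX-512
// (UseAVX > 2).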
10074 
10075 instruct vand4B(vecS dst, vecS src) %{
10076   predicate(n->as_Vector()->length_in_bytes() == 4);
10077   match(Set dst (AndV dst src));
10078   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
10079   ins_encode %{
10080     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10081   %}
10082   ins_pipe( pipe_slow );
10083 %}
10084 
10085 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
10086   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10087   match(Set dst (AndV src1 src2));
10088   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
10089   ins_encode %{
10090     int vector_len = 0;
10091     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10092   %}
10093   ins_pipe( pipe_slow );
10094 %}
10095 
10096 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
10097   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10098   match(Set dst (AndV src (LoadVector mem)));
10099   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
10100   ins_encode %{
10101     int vector_len = 0;
10102     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10103   %}
10104   ins_pipe( pipe_slow );
10105 %}
10106 
10107 instruct vand8B(vecD dst, vecD src) %{
10108   predicate(n->as_Vector()->length_in_bytes() == 8);
10109   match(Set dst (AndV dst src));
10110   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
10111   ins_encode %{
10112     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10113   %}
10114   ins_pipe( pipe_slow );
10115 %}
10116 
10117 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
10118   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10119   match(Set dst (AndV src1 src2));
10120   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
10121   ins_encode %{
10122     int vector_len = 0;
10123     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10124   %}
10125   ins_pipe( pipe_slow );
10126 %}
10127 
10128 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
10129   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10130   match(Set dst (AndV src (LoadVector mem)));
10131   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
10132   ins_encode %{
10133     int vector_len = 0;
10134     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10135   %}
10136   ins_pipe( pipe_slow );
10137 %}
10138 
10139 instruct vand16B(vecX dst, vecX src) %{
10140   predicate(n->as_Vector()->length_in_bytes() == 16);
10141   match(Set dst (AndV dst src));
10142   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
10143   ins_encode %{
10144     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10145   %}
10146   ins_pipe( pipe_slow );
10147 %}
10148 
10149 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
10150   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10151   match(Set dst (AndV src1 src2));
10152   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
10153   ins_encode %{
10154     int vector_len = 0;
10155     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10156   %}
10157   ins_pipe( pipe_slow );
10158 %}
10159 
10160 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
10161   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10162   match(Set dst (AndV src (LoadVector mem)));
10163   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
10164   ins_encode %{
10165     int vector_len = 0;
10166     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10167   %}
10168   ins_pipe( pipe_slow );
10169 %}
10170 
10171 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
10172   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10173   match(Set dst (AndV src1 src2));
10174   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
10175   ins_encode %{
10176     int vector_len = 1;
10177     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10178   %}
10179   ins_pipe( pipe_slow );
10180 %}
10181 
10182 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
10183   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10184   match(Set dst (AndV src (LoadVector mem)));
10185   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
10186   ins_encode %{
10187     int vector_len = 1;
10188     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10189   %}
10190   ins_pipe( pipe_slow );
10191 %}
10192 
10193 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10194   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10195   match(Set dst (AndV src1 src2));
10196   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
10197   ins_encode %{
10198     int vector_len = 2;
10199     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10200   %}
10201   ins_pipe( pipe_slow );
10202 %}
10203 
10204 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
10205   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10206   match(Set dst (AndV src (LoadVector mem)));
10207   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
10208   ins_encode %{
10209     int vector_len = 2;
10210     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10211   %}
10212   ins_pipe( pipe_slow );
10213 %}
10214 
10215 // --------------------------------- OR ---------------------------------------
10216 
10217 instruct vor4B(vecS dst, vecS src) %{
10218   predicate(n->as_Vector()->length_in_bytes() == 4);
10219   match(Set dst (OrV dst src));
10220   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
10221   ins_encode %{
10222     __ por($dst$$XMMRegister, $src$$XMMRegister);
10223   %}
10224   ins_pipe( pipe_slow );
10225 %}
10226 
10227 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
10228   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10229   match(Set dst (OrV src1 src2));
10230   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
10231   ins_encode %{
10232     int vector_len = 0;
10233     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10234   %}
10235   ins_pipe( pipe_slow );
10236 %}
10237 
10238 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
10239   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10240   match(Set dst (OrV src (LoadVector mem)));
10241   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
10242   ins_encode %{
10243     int vector_len = 0;
10244     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10245   %}
10246   ins_pipe( pipe_slow );
10247 %}
10248 
10249 instruct vor8B(vecD dst, vecD src) %{
10250   predicate(n->as_Vector()->length_in_bytes() == 8);
10251   match(Set dst (OrV dst src));
10252   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
10253   ins_encode %{
10254     __ por($dst$$XMMRegister, $src$$XMMRegister);
10255   %}
10256   ins_pipe( pipe_slow );
10257 %}
10258 
10259 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
10260   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10261   match(Set dst (OrV src1 src2));
10262   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
10263   ins_encode %{
10264     int vector_len = 0;
10265     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10266   %}
10267   ins_pipe( pipe_slow );
10268 %}
10269 
10270 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
10271   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10272   match(Set dst (OrV src (LoadVector mem)));
10273   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
10274   ins_encode %{
10275     int vector_len = 0;
10276     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10277   %}
10278   ins_pipe( pipe_slow );
10279 %}
10280 
10281 instruct vor16B(vecX dst, vecX src) %{
10282   predicate(n->as_Vector()->length_in_bytes() == 16);
10283   match(Set dst (OrV dst src));
10284   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
10285   ins_encode %{
10286     __ por($dst$$XMMRegister, $src$$XMMRegister);
10287   %}
10288   ins_pipe( pipe_slow );
10289 %}
10290 
10291 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
10292   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10293   match(Set dst (OrV src1 src2));
10294   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
10295   ins_encode %{
10296     int vector_len = 0;
10297     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10298   %}
10299   ins_pipe( pipe_slow );
10300 %}
10301 
10302 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
10303   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10304   match(Set dst (OrV src (LoadVector mem)));
10305   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
10306   ins_encode %{
10307     int vector_len = 0;
10308     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10309   %}
10310   ins_pipe( pipe_slow );
10311 %}
10312 
10313 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
10314   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10315   match(Set dst (OrV src1 src2));
10316   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
10317   ins_encode %{
10318     int vector_len = 1;
10319     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10320   %}
10321   ins_pipe( pipe_slow );
10322 %}
10323 
10324 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
10325   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10326   match(Set dst (OrV src (LoadVector mem)));
10327   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
10328   ins_encode %{
10329     int vector_len = 1;
10330     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10331   %}
10332   ins_pipe( pipe_slow );
10333 %}
10334 
10335 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10336   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10337   match(Set dst (OrV src1 src2));
10338   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
10339   ins_encode %{
10340     int vector_len = 2;
10341     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10342   %}
10343   ins_pipe( pipe_slow );
10344 %}
10345 
10346 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
10347   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10348   match(Set dst (OrV src (LoadVector mem)));
10349   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
10350   ins_encode %{
10351     int vector_len = 2;
10352     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10353   %}
10354   ins_pipe( pipe_slow );
10355 %}
10356 
10357 // --------------------------------- XOR --------------------------------------
10358 
10359 instruct vxor4B(vecS dst, vecS src) %{
10360   predicate(n->as_Vector()->length_in_bytes() == 4);
10361   match(Set dst (XorV dst src));
10362   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
10363   ins_encode %{
10364     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10365   %}
10366   ins_pipe( pipe_slow );
10367 %}
10368 
10369 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
10370   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10371   match(Set dst (XorV src1 src2));
10372   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
10373   ins_encode %{
10374     int vector_len = 0;
10375     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10376   %}
10377   ins_pipe( pipe_slow );
10378 %}
10379 
10380 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
10381   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10382   match(Set dst (XorV src (LoadVector mem)));
10383   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
10384   ins_encode %{
10385     int vector_len = 0;
10386     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10387   %}
10388   ins_pipe( pipe_slow );
10389 %}
10390 
10391 instruct vxor8B(vecD dst, vecD src) %{
10392   predicate(n->as_Vector()->length_in_bytes() == 8);
10393   match(Set dst (XorV dst src));
10394   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
10395   ins_encode %{
10396     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10397   %}
10398   ins_pipe( pipe_slow );
10399 %}
10400 
10401 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
10402   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10403   match(Set dst (XorV src1 src2));
10404   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
10405   ins_encode %{
10406     int vector_len = 0;
10407     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10408   %}
10409   ins_pipe( pipe_slow );
10410 %}
10411 
10412 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
10413   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10414   match(Set dst (XorV src (LoadVector mem)));
10415   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
10416   ins_encode %{
10417     int vector_len = 0;
10418     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10419   %}
10420   ins_pipe( pipe_slow );
10421 %}
10422 
10423 instruct vxor16B(vecX dst, vecX src) %{
10424   predicate(n->as_Vector()->length_in_bytes() == 16);
10425   match(Set dst (XorV dst src));
10426   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
10427   ins_encode %{
10428     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10429   %}
10430   ins_pipe( pipe_slow );
10431 %}
10432 
10433 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
10434   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10435   match(Set dst (XorV src1 src2));
10436   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
10437   ins_encode %{
10438     int vector_len = 0;
10439     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10440   %}
10441   ins_pipe( pipe_slow );
10442 %}
10443 
10444 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
10445   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10446   match(Set dst (XorV src (LoadVector mem)));
10447   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
10448   ins_encode %{
10449     int vector_len = 0;
10450     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10451   %}
10452   ins_pipe( pipe_slow );
10453 %}
10454 
10455 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
10456   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10457   match(Set dst (XorV src1 src2));
10458   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
10459   ins_encode %{
10460     int vector_len = 1;
10461     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10462   %}
10463   ins_pipe( pipe_slow );
10464 %}
10465 
10466 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
10467   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10468   match(Set dst (XorV src (LoadVector mem)));
10469   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
10470   ins_encode %{
10471     int vector_len = 1;
10472     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10473   %}
10474   ins_pipe( pipe_slow );
10475 %}
10476 
10477 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10478   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10479   match(Set dst (XorV src1 src2));
10480   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
10481   ins_encode %{
10482     int vector_len = 2;
10483     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10484   %}
10485   ins_pipe( pipe_slow );
10486 %}
10487 
10488 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
10489   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10490   match(Set dst (XorV src (LoadVector mem)));
10491   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
10492   ins_encode %{
10493     int vector_len = 2;
10494     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10495   %}
10496   ins_pipe( pipe_slow );
10497 %}
10498