1 //
   2 // Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
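     //
     // For illustration only (the general-purpose registers themselves are
     // defined in the 32-bit/64-bit specific AD files, not in this common
     // file): a caller-saved integer register with encoding 0 would be
     // declared roughly as
     //   reg_def RAX (SOC, SOC, Op_RegI, 0, rax->as_VMReg());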
  61 
  62 // XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM15 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
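     //              (For example, on both ABIs XMM0 carries the first
     //               floating-point argument and the floating-point return
     //               value.)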
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
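     // Flags (condition code) register.  Its encoding slot sits just past the
     // general-purpose registers, which is presumably why it is 16 in 64-bit
     // mode and 8 in 32-bit mode.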
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre-EVEX float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for EVEX float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
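     // reg_class_dynamic picks one of the two classes above based on the
     // trailing %{ ... %} predicate: the first (EVEX) class when the predicate
     // is true, otherwise the second (legacy) class.  The same pattern is used
     // for the double and vector classes below.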
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 
 733 // Class for pre-EVEX double registers
 734 reg_class double_reg_legacy(XMM0,  XMM0b,
 735                      XMM1,  XMM1b,
 736                      XMM2,  XMM2b,
 737                      XMM3,  XMM3b,
 738                      XMM4,  XMM4b,
 739                      XMM5,  XMM5b,
 740                      XMM6,  XMM6b,
 741                      XMM7,  XMM7b
 742 #ifdef _LP64
 743                     ,XMM8,  XMM8b,
 744                      XMM9,  XMM9b,
 745                      XMM10, XMM10b,
 746                      XMM11, XMM11b,
 747                      XMM12, XMM12b,
 748                      XMM13, XMM13b,
 749                      XMM14, XMM14b,
 750                      XMM15, XMM15b
 751 #endif
 752                      );
 753 
 754 // Class for EVEX double registers
 755 reg_class double_reg_evex(XMM0,  XMM0b,
 756                      XMM1,  XMM1b,
 757                      XMM2,  XMM2b,
 758                      XMM3,  XMM3b,
 759                      XMM4,  XMM4b,
 760                      XMM5,  XMM5b,
 761                      XMM6,  XMM6b,
 762                      XMM7,  XMM7b
 763 #ifdef _LP64
 764                     ,XMM8,  XMM8b,
 765                      XMM9,  XMM9b,
 766                      XMM10, XMM10b,
 767                      XMM11, XMM11b,
 768                      XMM12, XMM12b,
 769                      XMM13, XMM13b,
 770                      XMM14, XMM14b,
 771                      XMM15, XMM15b,
 772                      XMM16, XMM16b,
 773                      XMM17, XMM17b,
 774                      XMM18, XMM18b,
 775                      XMM19, XMM19b,
 776                      XMM20, XMM20b,
 777                      XMM21, XMM21b,
 778                      XMM22, XMM22b,
 779                      XMM23, XMM23b,
 780                      XMM24, XMM24b,
 781                      XMM25, XMM25b,
 782                      XMM26, XMM26b,
 783                      XMM27, XMM27b,
 784                      XMM28, XMM28b,
 785                      XMM29, XMM29b,
 786                      XMM30, XMM30b,
 787                      XMM31, XMM31b
 788 #endif
 789                      );
 790 
 791 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 792 
 793 // Class for pre-EVEX 32bit vector registers
 794 reg_class vectors_reg_legacy(XMM0,
 795                       XMM1,
 796                       XMM2,
 797                       XMM3,
 798                       XMM4,
 799                       XMM5,
 800                       XMM6,
 801                       XMM7
 802 #ifdef _LP64
 803                      ,XMM8,
 804                       XMM9,
 805                       XMM10,
 806                       XMM11,
 807                       XMM12,
 808                       XMM13,
 809                       XMM14,
 810                       XMM15
 811 #endif
 812                       );
 813 
 814 // Class for EVEX 32bit vector registers
 815 reg_class vectors_reg_evex(XMM0,
 816                       XMM1,
 817                       XMM2,
 818                       XMM3,
 819                       XMM4,
 820                       XMM5,
 821                       XMM6,
 822                       XMM7
 823 #ifdef _LP64
 824                      ,XMM8,
 825                       XMM9,
 826                       XMM10,
 827                       XMM11,
 828                       XMM12,
 829                       XMM13,
 830                       XMM14,
 831                       XMM15,
 832                       XMM16,
 833                       XMM17,
 834                       XMM18,
 835                       XMM19,
 836                       XMM20,
 837                       XMM21,
 838                       XMM22,
 839                       XMM23,
 840                       XMM24,
 841                       XMM25,
 842                       XMM26,
 843                       XMM27,
 844                       XMM28,
 845                       XMM29,
 846                       XMM30,
 847                       XMM31
 848 #endif
 849                       );
 850 
 851 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 852 
 853 // Class for pre-EVEX 64bit vector registers
 854 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 855                       XMM1,  XMM1b,
 856                       XMM2,  XMM2b,
 857                       XMM3,  XMM3b,
 858                       XMM4,  XMM4b,
 859                       XMM5,  XMM5b,
 860                       XMM6,  XMM6b,
 861                       XMM7,  XMM7b
 862 #ifdef _LP64
 863                      ,XMM8,  XMM8b,
 864                       XMM9,  XMM9b,
 865                       XMM10, XMM10b,
 866                       XMM11, XMM11b,
 867                       XMM12, XMM12b,
 868                       XMM13, XMM13b,
 869                       XMM14, XMM14b,
 870                       XMM15, XMM15b
 871 #endif
 872                       );
 873 
 874 // Class for EVEX 64bit vector registers
 875 reg_class vectord_reg_evex(XMM0,  XMM0b,
 876                       XMM1,  XMM1b,
 877                       XMM2,  XMM2b,
 878                       XMM3,  XMM3b,
 879                       XMM4,  XMM4b,
 880                       XMM5,  XMM5b,
 881                       XMM6,  XMM6b,
 882                       XMM7,  XMM7b
 883 #ifdef _LP64
 884                      ,XMM8,  XMM8b,
 885                       XMM9,  XMM9b,
 886                       XMM10, XMM10b,
 887                       XMM11, XMM11b,
 888                       XMM12, XMM12b,
 889                       XMM13, XMM13b,
 890                       XMM14, XMM14b,
 891                       XMM15, XMM15b,
 892                       XMM16, XMM16b,
 893                       XMM17, XMM17b,
 894                       XMM18, XMM18b,
 895                       XMM19, XMM19b,
 896                       XMM20, XMM20b,
 897                       XMM21, XMM21b,
 898                       XMM22, XMM22b,
 899                       XMM23, XMM23b,
 900                       XMM24, XMM24b,
 901                       XMM25, XMM25b,
 902                       XMM26, XMM26b,
 903                       XMM27, XMM27b,
 904                       XMM28, XMM28b,
 905                       XMM29, XMM29b,
 906                       XMM30, XMM30b,
 907                       XMM31, XMM31b
 908 #endif
 909                       );
 910 
 911 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 912 
 913 // Class for pre-EVEX 128bit vector registers
 914 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 915                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 916                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 917                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 918                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 919                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 920                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 921                       XMM7,  XMM7b,  XMM7c,  XMM7d
 922 #ifdef _LP64
 923                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 924                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 925                       XMM10, XMM10b, XMM10c, XMM10d,
 926                       XMM11, XMM11b, XMM11c, XMM11d,
 927                       XMM12, XMM12b, XMM12c, XMM12d,
 928                       XMM13, XMM13b, XMM13c, XMM13d,
 929                       XMM14, XMM14b, XMM14c, XMM14d,
 930                       XMM15, XMM15b, XMM15c, XMM15d
 931 #endif
 932                       );
 933 
 934 // Class for EVEX 128bit vector registers
 935 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 936                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 937                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 938                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 939                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 940                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 941                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 942                       XMM7,  XMM7b,  XMM7c,  XMM7d
 943 #ifdef _LP64
 944                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 945                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 946                       XMM10, XMM10b, XMM10c, XMM10d,
 947                       XMM11, XMM11b, XMM11c, XMM11d,
 948                       XMM12, XMM12b, XMM12c, XMM12d,
 949                       XMM13, XMM13b, XMM13c, XMM13d,
 950                       XMM14, XMM14b, XMM14c, XMM14d,
 951                       XMM15, XMM15b, XMM15c, XMM15d,
 952                       XMM16, XMM16b, XMM16c, XMM16d,
 953                       XMM17, XMM17b, XMM17c, XMM17d,
 954                       XMM18, XMM18b, XMM18c, XMM18d,
 955                       XMM19, XMM19b, XMM19c, XMM19d,
 956                       XMM20, XMM20b, XMM20c, XMM20d,
 957                       XMM21, XMM21b, XMM21c, XMM21d,
 958                       XMM22, XMM22b, XMM22c, XMM22d,
 959                       XMM23, XMM23b, XMM23c, XMM23d,
 960                       XMM24, XMM24b, XMM24c, XMM24d,
 961                       XMM25, XMM25b, XMM25c, XMM25d,
 962                       XMM26, XMM26b, XMM26c, XMM26d,
 963                       XMM27, XMM27b, XMM27c, XMM27d,
 964                       XMM28, XMM28b, XMM28c, XMM28d,
 965                       XMM29, XMM29b, XMM29c, XMM29d,
 966                       XMM30, XMM30b, XMM30c, XMM30d,
 967                       XMM31, XMM31b, XMM31c, XMM31d
 968 #endif
 969                       );
 970 
 971 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 972 
 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
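// There is no legacy/EVEX split for the 512-bit class: ZMM registers only
// exist on EVEX-capable (AVX-512) hardware, so a single class suffices.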
1069 
1070 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1071 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1072 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1073 
1074 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1075 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1076 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1077 
1078 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1079 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1080 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1081 
1082 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1083 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1084 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1085 
1086 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1087 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1088 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1089 
1090 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1091 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1092 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1093 
1094 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1095 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1096 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1097 
1098 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1099 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1100 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1101 
1102 #ifdef _LP64
1103 
1104 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1105 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1106 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1107 
1108 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1109 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1110 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1111 
1112 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1113 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1114 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1115 
1116 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1117 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1118 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1119 
1120 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1121 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1122 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1123 
1124 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1125 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1126 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1127 
1128 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1129 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1130 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1131 
1132 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1133 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1134 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1135 
1136 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1137 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1138 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1139 
1140 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1141 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1142 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1143 
1144 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1145 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1146 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1147 
1148 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1149 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1150 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1151 
1152 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1153 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1154 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1155 
1156 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1157 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1158 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1159 
1160 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1161 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1162 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1163 
1164 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1165 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1166 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1167 
1168 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1169 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1170 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1171 
1172 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1173 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1174 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1175 
1176 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1177 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1178 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1179 
1180 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1181 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1182 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1183 
1184 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1185 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1186 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1187 
1188 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1189 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1190 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1191 
1192 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1193 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1194 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1195 
1196 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1197 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1198 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1199 
1200 #endif
1201 
1202 %}
1203 
1204 
1205 //----------SOURCE BLOCK-------------------------------------------------------
1206 // This is a block of C++ code which provides values, functions, and
1207 // definitions necessary in the rest of the architecture description
1208 
1209 source_hpp %{
1210 // Header information of the source block.
1211 // Method declarations/definitions which are used outside
1212 // the ad-scope can conveniently be defined here.
1213 //
1214 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} blocks freely as needed.
1216 
1217 class NativeJump;
1218 
1219 class CallStubImpl {
1220 
1221   //--------------------------------------------------------------
1222   //---<  Used for optimization in Compile::shorten_branches  >---
1223   //--------------------------------------------------------------
1224 
1225  public:
1226   // Size of call trampoline stub.
1227   static uint size_call_trampoline() {
1228     return 0; // no call trampolines on this platform
1229   }
1230 
1231   // number of relocations needed by a call trampoline stub
1232   static uint reloc_call_trampoline() {
1233     return 0; // no call trampolines on this platform
1234   }
1235 };
1236 
1237 class HandlerImpl {
1238 
1239  public:
1240 
1241   static int emit_exception_handler(CodeBuffer &cbuf);
1242   static int emit_deopt_handler(CodeBuffer& cbuf);
1243 
1244   static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization (4932387).
1248     // Note that this value is also credited (in output.cpp) to
1249     // the size of the code section.
1250     return NativeJump::instruction_size;
1251   }
1252 
1253 #ifdef _LP64
1254   static uint size_deopt_handler() {
1255     // three 5 byte instructions plus one move for unreachable address.
1256     return 15+3;
1257   }
1258 #else
1259   static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization (4932387).
1263     // Note that this value is also credited (in output.cpp) to
1264     // the size of the code section.
1265     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1266   }
1267 #endif
1268 };
1269 
1270 %} // end source_hpp
1271 
1272 source %{
1273 
1274 #include "opto/addnode.hpp"
1275 
1276 // Emit exception handler code.
1277 // Stuff framesize into a register and call a VM stub routine.
1278 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1279 
1280   // Note that the code buffer's insts_mark is always relative to insts.
1281   // That's why we must use the macroassembler to generate a handler.
1282   MacroAssembler _masm(&cbuf);
1283   address base = __ start_a_stub(size_exception_handler());
1284   if (base == NULL) {
1285     ciEnv::current()->record_failure("CodeCache is full");
1286     return 0;  // CodeBuffer::expand failed
1287   }
1288   int offset = __ offset();
1289   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1290   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1291   __ end_a_stub();
1292   return offset;
1293 }
1294 
1295 // Emit deopt handler code.
1296 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1297 
1298   // Note that the code buffer's insts_mark is always relative to insts.
1299   // That's why we must use the macroassembler to generate a handler.
1300   MacroAssembler _masm(&cbuf);
1301   address base = __ start_a_stub(size_deopt_handler());
1302   if (base == NULL) {
1303     ciEnv::current()->record_failure("CodeCache is full");
1304     return 0;  // CodeBuffer::expand failed
1305   }
1306   int offset = __ offset();
1307 
1308 #ifdef _LP64
1309   address the_pc = (address) __ pc();
1310   Label next;
  // push "the_pc" on the stack without destroying any registers,
  // as they all may be live.
1313 
1314   // push address of "next"
1315   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1316   __ bind(next);
1317   // adjust it so it matches "the_pc"
1318   __ subptr(Address(rsp, 0), __ offset() - offset);
1319 #else
1320   InternalAddress here(__ pc());
1321   __ pushptr(here.addr());
1322 #endif
1323 
1324   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1325   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1326   __ end_a_stub();
1327   return offset;
1328 }
1329 
1330 
1331 //=============================================================================
1332 
1333   // Float masks come from different places depending on platform.
1334 #ifdef _LP64
1335   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1336   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1337   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1338   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1339 #else
1340   static address float_signmask()  { return (address)float_signmask_pool; }
1341   static address float_signflip()  { return (address)float_signflip_pool; }
1342   static address double_signmask() { return (address)double_signmask_pool; }
1343   static address double_signflip() { return (address)double_signflip_pool; }
1344 #endif
1345 
1346 
1347 const bool Matcher::match_rule_supported(int opcode) {
1348   if (!has_match_rule(opcode))
1349     return false;
1350 
1351   bool ret_value = true;
1352   switch (opcode) {
1353     case Op_PopCountI:
1354     case Op_PopCountL:
1355       if (!UsePopCountInstruction)
1356         ret_value = false;
1357       break;
1358     case Op_PopCountVI:
1359       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1360         ret_value = false;
1361       break;
1362     case Op_MulVI:
1363       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1364         ret_value = false;
1365       break;
1366     case Op_MulVL:
1367     case Op_MulReductionVL:
1368       if (VM_Version::supports_avx512dq() == false)
1369         ret_value = false;
1370       break;
1371     case Op_AddReductionVL:
1372       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1373         ret_value = false;
1374       break;
1375     case Op_AddReductionVI:
1376       if (UseSSE < 3) // requires at least SSE3
1377         ret_value = false;
1378       break;
1379     case Op_MulReductionVI:
1380       if (UseSSE < 4) // requires at least SSE4
1381         ret_value = false;
1382       break;
1383     case Op_AddReductionVF:
1384     case Op_AddReductionVD:
1385     case Op_MulReductionVF:
1386     case Op_MulReductionVD:
1387       if (UseSSE < 1) // requires at least SSE
1388         ret_value = false;
1389       break;
1390     case Op_SqrtVD:
1391     case Op_SqrtVF:
1392       if (UseAVX < 1) // enabled for AVX only
1393         ret_value = false;
1394       break;
1395     case Op_CompareAndSwapL:
1396 #ifdef _LP64
1397     case Op_CompareAndSwapP:
1398 #endif
1399       if (!VM_Version::supports_cx8())
1400         ret_value = false;
1401       break;
1402     case Op_CMoveVF:
1403     case Op_CMoveVD:
1404       if (UseAVX < 1 || UseAVX > 2)
1405         ret_value = false;
1406       break;
1407     case Op_StrIndexOf:
1408       if (!UseSSE42Intrinsics)
1409         ret_value = false;
1410       break;
1411     case Op_StrIndexOfChar:
1412       if (!UseSSE42Intrinsics)
1413         ret_value = false;
1414       break;
1415     case Op_OnSpinWait:
1416       if (VM_Version::supports_on_spin_wait() == false)
1417         ret_value = false;
1418       break;
1419   }
1420 
  return ret_value;  // By default match rules are supported.
1422 }
1423 
1424 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, while guarding with vlen.
1427   bool ret_value = match_rule_supported(opcode);
1428   if (ret_value) {
1429     switch (opcode) {
1430       case Op_AddVB:
1431       case Op_SubVB:
1432         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1433           ret_value = false;
1434         break;
1435       case Op_URShiftVS:
1436       case Op_RShiftVS:
1437       case Op_LShiftVS:
1438       case Op_MulVS:
1439       case Op_AddVS:
1440       case Op_SubVS:
1441         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1442           ret_value = false;
1443         break;
1444       case Op_CMoveVF:
1445         if (vlen != 8)
1446           ret_value  = false;
1447         break;
1448       case Op_CMoveVD:
1449         if (vlen != 4)
1450           ret_value  = false;
1451         break;
1452     }
1453   }
1454 
  return ret_value;  // By default match rules are supported.
1456 }
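// For example, with the guards above a 64-byte (512-bit) byte-vector
// AddVB/SubVB and a 32-element short-vector operation are only matched
// when AVX512BW is available.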
1457 
1458 const bool Matcher::has_predicated_vectors(void) {
1459   bool ret_value = false;
1460   if (UseAVX > 2) {
1461     ret_value = VM_Version::supports_avx512vl();
1462   }
1463 
1464   return ret_value;
1465 }
1466 
1467 const int Matcher::float_pressure(int default_pressure_threshold) {
1468   int float_pressure_threshold = default_pressure_threshold;
1469 #ifdef _LP64
1470   if (UseAVX > 2) {
1471     // Increase pressure threshold on machines with AVX3 which have
1472     // 2x more XMM registers.
1473     float_pressure_threshold = default_pressure_threshold * 2;
1474   }
1475 #endif
1476   return float_pressure_threshold;
1477 }
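// For example, on an AVX-512 capable 64-bit machine (UseAVX > 2) there are
// 32 XMM registers instead of 16, so a default threshold of N becomes 2*N.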
1478 
1479 // Max vector size in bytes. 0 if not supported.
1480 const int Matcher::vector_width_in_bytes(BasicType bt) {
1481   assert(is_java_primitive(bt), "only primitive type vectors");
1482   if (UseSSE < 2) return 0;
1483   // SSE2 supports 128bit vectors for all types.
1484   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
1486   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1487   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1488   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1489     size = (UseAVX > 2) ? 64 : 32;
1490   // Use flag to limit vector size.
1491   size = MIN2(size,(int)MaxVectorSize);
1492   // Minimum 2 values in vector (or 4 for bytes).
1493   switch (bt) {
1494   case T_DOUBLE:
1495   case T_LONG:
1496     if (size < 16) return 0;
1497     break;
1498   case T_FLOAT:
1499   case T_INT:
1500     if (size < 8) return 0;
1501     break;
1502   case T_BOOLEAN:
1503     if (size < 4) return 0;
1504     break;
1505   case T_CHAR:
1506     if (size < 4) return 0;
1507     break;
1508   case T_BYTE:
1509     if (size < 4) return 0;
1510     break;
1511   case T_SHORT:
1512     if (size < 4) return 0;
1513     break;
1514   default:
1515     ShouldNotReachHere();
1516   }
1517   return size;
1518 }
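// Illustrative sizes before the MaxVectorSize clamp: UseSSE >= 2 with
// UseAVX <= 1 gives 16 bytes, UseAVX == 2 gives (1 << 2) * 8 = 32 bytes and
// UseAVX == 3 gives 64 bytes; AVX1 alone still allows 32 bytes, but only for
// T_FLOAT and T_DOUBLE.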
1519 
1520 // Limits on vector size (number of elements) loaded into vector.
1521 const int Matcher::max_vector_size(const BasicType bt) {
1522   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1523 }
1524 const int Matcher::min_vector_size(const BasicType bt) {
1525   int max_size = max_vector_size(bt);
1526   // Min size which can be loaded into vector is 4 bytes.
1527   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1528   return MIN2(size,max_size);
1529 }
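// For example, byte elements (1 byte each) need at least 4 lanes, while
// wider element types need at least 2 lanes, both capped by
// max_vector_size(bt).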
1530 
// Vector ideal reg corresponding to specified size in bytes
1532 const uint Matcher::vector_ideal_reg(int size) {
1533   assert(MaxVectorSize >= size, "");
1534   switch(size) {
1535     case  4: return Op_VecS;
1536     case  8: return Op_VecD;
1537     case 16: return Op_VecX;
1538     case 32: return Op_VecY;
1539     case 64: return Op_VecZ;
1540   }
1541   ShouldNotReachHere();
1542   return 0;
1543 }
1544 
1545 // Only lowest bits of xmm reg are used for vector shift count.
1546 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1547   return Op_VecS;
1548 }
1549 
// x86 supports misaligned vector stores/loads.
1551 const bool Matcher::misaligned_vectors_ok() {
1552   return !AlignVector; // can be changed by flag
1553 }
1554 
1555 // x86 AES instructions are compatible with SunJCE expanded
1556 // keys, hence we do not need to pass the original key to stubs
1557 const bool Matcher::pass_original_key_for_aes() {
1558   return false;
1559 }
1560 
1561 
1562 const bool Matcher::convi2l_type_required = true;
1563 
1564 // Check for shift by small constant as well
1565 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1566   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1567       shift->in(2)->get_int() <= 3 &&
1568       // Are there other uses besides address expressions?
1569       !matcher->is_visited(shift)) {
1570     address_visited.set(shift->_idx); // Flag as address_visited
1571     mstack.push(shift->in(2), Matcher::Visit);
1572     Node *conv = shift->in(1);
1573 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1577     if (conv->Opcode() == Op_ConvI2L &&
1578         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1579         // Are there other uses besides address expressions?
1580         !matcher->is_visited(conv)) {
1581       address_visited.set(conv->_idx); // Flag as address_visited
1582       mstack.push(conv->in(1), Matcher::Pre_Visit);
1583     } else
1584 #endif
1585       mstack.push(conv, Matcher::Pre_Visit);
1586     return true;
1587   }
1588   return false;
1589 }
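// Example of the shape recognized here: an address such as
// base + (index << 2) + offset, where a left shift by a constant <= 3 maps
// onto the x86 scale factor (1/2/4/8) of the addressing mode instead of
// being computed into a separate register.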
1590 
1591 // Should the Matcher clone shifts on addressing modes, expecting them
1592 // to be subsumed into complex addressing expressions or compute them
1593 // into registers?
1594 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1595   Node *off = m->in(AddPNode::Offset);
1596   if (off->is_Con()) {
1597     address_visited.test_set(m->_idx); // Flag as address_visited
1598     Node *adr = m->in(AddPNode::Address);
1599 
1600     // Intel can handle 2 adds in addressing mode
1601     // AtomicAdd is not an addressing expression.
1602     // Cheap to find it by looking for screwy base.
1603     if (adr->is_AddP() &&
1604         !adr->in(AddPNode::Base)->is_top() &&
1605         // Are there other uses besides address expressions?
1606         !is_visited(adr)) {
1607       address_visited.set(adr->_idx); // Flag as address_visited
1608       Node *shift = adr->in(AddPNode::Offset);
1609       if (!clone_shift(shift, this, mstack, address_visited)) {
1610         mstack.push(shift, Pre_Visit);
1611       }
1612       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1613       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1614     } else {
1615       mstack.push(adr, Pre_Visit);
1616     }
1617 
1618     // Clone X+offset as it also folds into most addressing expressions
1619     mstack.push(off, Visit);
1620     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1621     return true;
1622   } else if (clone_shift(off, this, mstack, address_visited)) {
1623     address_visited.test_set(m->_idx); // Flag as address_visited
1624     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1625     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1626     return true;
1627   }
1628   return false;
1629 }
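// Together with clone_shift() above, this lets the matcher fold expressions
// of the form base + (index << scale) + constant_offset into a single
// addressing mode rather than materializing intermediate results.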
1630 
1631 void Compile::reshape_address(AddPNode* addp) {
1632 }
1633 
1634 // Helper methods for MachSpillCopyNode::implementation().
1635 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1636                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is very complex, so instructions are
  // emitted into a scratch buffer to determine the size.
1639   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1640   assert(ireg == Op_VecS || // 32bit vector
1641          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1642          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1643          "no non-adjacent vector moves" );
1644   if (cbuf) {
1645     MacroAssembler _masm(cbuf);
1646     int offset = __ offset();
1647     switch (ireg) {
1648     case Op_VecS: // copy whole register
1649     case Op_VecD:
1650     case Op_VecX:
1651       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1652       break;
1653     case Op_VecY:
1654       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1655       break;
1656     case Op_VecZ:
1657       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1658       break;
1659     default:
1660       ShouldNotReachHere();
1661     }
1662     int size = __ offset() - offset;
1663 #ifdef ASSERT
1664     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1666 #endif
1667     return size;
1668 #ifndef PRODUCT
1669   } else if (!do_size) {
1670     switch (ireg) {
1671     case Op_VecS:
1672     case Op_VecD:
1673     case Op_VecX:
1674       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1675       break;
1676     case Op_VecY:
1677     case Op_VecZ:
1678       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1679       break;
1680     default:
1681       ShouldNotReachHere();
1682     }
1683 #endif
1684   }
1685   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1686   return (UseAVX > 2) ? 6 : 4;
1687 }
1688 
1689 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1690                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is very complex, so instructions are
  // emitted into a scratch buffer to determine the size.
1693   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1694   if (cbuf) {
1695     MacroAssembler _masm(cbuf);
1696     int offset = __ offset();
1697     if (is_load) {
1698       switch (ireg) {
1699       case Op_VecS:
1700         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1701         break;
1702       case Op_VecD:
1703         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1704         break;
1705       case Op_VecX:
1706         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1707         break;
1708       case Op_VecY:
1709         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1710         break;
1711       case Op_VecZ:
1712         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1713         break;
1714       default:
1715         ShouldNotReachHere();
1716       }
1717     } else { // store
1718       switch (ireg) {
1719       case Op_VecS:
1720         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1721         break;
1722       case Op_VecD:
1723         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1724         break;
1725       case Op_VecX:
1726         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1727         break;
1728       case Op_VecY:
1729         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1730         break;
1731       case Op_VecZ:
1732         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1733         break;
1734       default:
1735         ShouldNotReachHere();
1736       }
1737     }
1738     int size = __ offset() - offset;
1739 #ifdef ASSERT
1740     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1741     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1743 #endif
1744     return size;
1745 #ifndef PRODUCT
1746   } else if (!do_size) {
1747     if (is_load) {
1748       switch (ireg) {
1749       case Op_VecS:
1750         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1751         break;
1752       case Op_VecD:
1753         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1754         break;
1755        case Op_VecX:
1756         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1757         break;
1758       case Op_VecY:
1759       case Op_VecZ:
1760         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1761         break;
1762       default:
1763         ShouldNotReachHere();
1764       }
1765     } else { // store
1766       switch (ireg) {
1767       case Op_VecS:
1768         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1769         break;
1770       case Op_VecD:
1771         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1772         break;
1773        case Op_VecX:
1774         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1775         break;
1776       case Op_VecY:
1777       case Op_VecZ:
1778         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1779         break;
1780       default:
1781         ShouldNotReachHere();
1782       }
1783     }
1784 #endif
1785   }
1786   bool is_single_byte = false;
1787   int vec_len = 0;
1788   if ((UseAVX > 2) && (stack_offset != 0)) {
1789     int tuple_type = Assembler::EVEX_FVM;
1790     int input_size = Assembler::EVEX_32bit;
1791     switch (ireg) {
1792     case Op_VecS:
1793       tuple_type = Assembler::EVEX_T1S;
1794       break;
1795     case Op_VecD:
1796       tuple_type = Assembler::EVEX_T1S;
1797       input_size = Assembler::EVEX_64bit;
1798       break;
1799     case Op_VecX:
1800       break;
1801     case Op_VecY:
1802       vec_len = 1;
1803       break;
1804     case Op_VecZ:
1805       vec_len = 2;
1806       break;
1807     }
1808     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1809   }
1810   int offset_size = 0;
1811   int size = 5;
1812   if (UseAVX > 2 ) {
1813     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1814       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1815       size += 2; // Need an additional two bytes for EVEX encoding
1816     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1817       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1818     } else {
1819       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1821     }
1822   } else {
1823     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1824   }
1825   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1826   return size+offset_size;
1827 }
1828 
1829 static inline jint replicate4_imm(int con, int width) {
1830   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1831   assert(width == 1 || width == 2, "only byte or short types here");
1832   int bit_width = width * 8;
1833   jint val = con;
1834   val &= (1 << bit_width) - 1;  // mask off sign bits
1835   while(bit_width < 32) {
1836     val |= (val << bit_width);
1837     bit_width <<= 1;
1838   }
1839   return val;
1840 }
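// For example, replicate4_imm(0x1, 1) == 0x01010101 and
// replicate4_imm(0xABCD, 2) == 0xABCDABCD.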
1841 
1842 static inline jlong replicate8_imm(int con, int width) {
1843   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1844   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1845   int bit_width = width * 8;
1846   jlong val = con;
1847   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1848   while(bit_width < 64) {
1849     val |= (val << bit_width);
1850     bit_width <<= 1;
1851   }
1852   return val;
1853 }
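// For example, replicate8_imm(0x12, 1) == 0x1212121212121212 and
// replicate8_imm(0x1234, 2) == 0x1234123412341234.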
1854 
1855 #ifndef PRODUCT
1856   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1857     st->print("nop \t# %d bytes pad for loops and calls", _count);
1858   }
1859 #endif
1860 
1861   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1862     MacroAssembler _masm(&cbuf);
1863     __ nop(_count);
1864   }
1865 
1866   uint MachNopNode::size(PhaseRegAlloc*) const {
1867     return _count;
1868   }
1869 
1870 #ifndef PRODUCT
1871   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1872     st->print("# breakpoint");
1873   }
1874 #endif
1875 
1876   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1877     MacroAssembler _masm(&cbuf);
1878     __ int3();
1879   }
1880 
1881   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1882     return MachNode::size(ra_);
1883   }
1884 
1885 %}
1886 
1887 encode %{
1888 
1889   enc_class call_epilog %{
1890     if (VerifyStackAtCalls) {
1891       // Check that stack depth is unchanged: find majik cookie on stack
1892       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1893       MacroAssembler _masm(&cbuf);
1894       Label L;
1895       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1896       __ jccb(Assembler::equal, L);
1897       // Die if stack mismatch
1898       __ int3();
1899       __ bind(L);
1900     }
1901   %}
1902 
1903 %}
1904 
1905 
1906 //----------OPERANDS-----------------------------------------------------------
1907 // Operand definitions must precede instruction definitions for correct parsing
1908 // in the ADLC because operands constitute user defined types which are used in
1909 // instruction definitions.
1910 
1911 // This one generically applies only for evex, so only one version
// This one applies only to EVEX, so there is only one version
1913   constraint(ALLOC_IN_RC(vectorz_reg));
1914   match(VecZ);
1915 
1916   format %{ %}
1917   interface(REG_INTER);
1918 %}
1919 
1920 // Comparison Code for FP conditional move
1921 operand cmpOp_vcmppd() %{
1922   match(Bool);
1923 
1924   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
1925             n->as_Bool()->_test._test != BoolTest::no_overflow);
1926   format %{ "" %}
1927   interface(COND_INTER) %{
1928     equal        (0x0, "eq");
1929     less         (0x1, "lt");
1930     less_equal   (0x2, "le");
1931     not_equal    (0xC, "ne");
1932     greater_equal(0xD, "ge");
1933     greater      (0xE, "gt");
    //TODO: adlc cannot compile without the next two lines; it fails with this error:
1935     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
1936     // equal' for overflow.
1937     overflow     (0x20, "o");  // not really supported by the instruction
1938     no_overflow  (0x21, "no"); // not really supported by the instruction
1939   %}
1940 %}
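// The codes above correspond to the comparison-predicate immediates of
// vcmppd (0x0 = EQ, 0x1 = LT, 0x2 = LE, 0xC = NEQ, 0xD = GE, 0xE = GT);
// overflow/no_overflow are placeholders only, as noted in the TODO above.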
1941 
1942 
1943 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1944 
1945 // ============================================================================
1946 
1947 instruct ShouldNotReachHere() %{
1948   match(Halt);
1949   format %{ "ud2\t# ShouldNotReachHere" %}
1950   ins_encode %{
1951     __ ud2();
1952   %}
1953   ins_pipe(pipe_slow);
1954 %}
1955 
1956 // =================================EVEX special===============================
1957 
1958 instruct setMask(rRegI dst, rRegI src) %{
1959   predicate(Matcher::has_predicated_vectors());
1960   match(Set dst (SetVectMaskI  src));
1961   effect(TEMP dst);
1962   format %{ "setvectmask   $dst, $src" %}
1963   ins_encode %{
1964     __ setvectmask($dst$$Register, $src$$Register);
1965   %}
1966   ins_pipe(pipe_slow);
1967 %}
1968 
1969 // ============================================================================
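// The scalar FP add/sub/mul patterns below come in two flavors: the SSE
// forms (UseAVX == 0) use the destructive two-operand encodings
// (e.g. "addss dst, src", where dst is also an input), while the AVX forms
// (UseAVX > 0) use the three-operand VEX encodings
// (e.g. "vaddss dst, src1, src2"), so the destination need not equal a source.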
1970 
1971 instruct addF_reg(regF dst, regF src) %{
1972   predicate((UseSSE>=1) && (UseAVX == 0));
1973   match(Set dst (AddF dst src));
1974 
1975   format %{ "addss   $dst, $src" %}
1976   ins_cost(150);
1977   ins_encode %{
1978     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1979   %}
1980   ins_pipe(pipe_slow);
1981 %}
1982 
1983 instruct addF_mem(regF dst, memory src) %{
1984   predicate((UseSSE>=1) && (UseAVX == 0));
1985   match(Set dst (AddF dst (LoadF src)));
1986 
1987   format %{ "addss   $dst, $src" %}
1988   ins_cost(150);
1989   ins_encode %{
1990     __ addss($dst$$XMMRegister, $src$$Address);
1991   %}
1992   ins_pipe(pipe_slow);
1993 %}
1994 
1995 instruct addF_imm(regF dst, immF con) %{
1996   predicate((UseSSE>=1) && (UseAVX == 0));
1997   match(Set dst (AddF dst con));
1998   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1999   ins_cost(150);
2000   ins_encode %{
2001     __ addss($dst$$XMMRegister, $constantaddress($con));
2002   %}
2003   ins_pipe(pipe_slow);
2004 %}
2005 
2006 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2007   predicate(UseAVX > 0);
2008   match(Set dst (AddF src1 src2));
2009 
2010   format %{ "vaddss  $dst, $src1, $src2" %}
2011   ins_cost(150);
2012   ins_encode %{
2013     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2014   %}
2015   ins_pipe(pipe_slow);
2016 %}
2017 
2018 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2019   predicate(UseAVX > 0);
2020   match(Set dst (AddF src1 (LoadF src2)));
2021 
2022   format %{ "vaddss  $dst, $src1, $src2" %}
2023   ins_cost(150);
2024   ins_encode %{
2025     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2026   %}
2027   ins_pipe(pipe_slow);
2028 %}
2029 
2030 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2031   predicate(UseAVX > 0);
2032   match(Set dst (AddF src con));
2033 
2034   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2035   ins_cost(150);
2036   ins_encode %{
2037     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2038   %}
2039   ins_pipe(pipe_slow);
2040 %}
2041 
2042 instruct addD_reg(regD dst, regD src) %{
2043   predicate((UseSSE>=2) && (UseAVX == 0));
2044   match(Set dst (AddD dst src));
2045 
2046   format %{ "addsd   $dst, $src" %}
2047   ins_cost(150);
2048   ins_encode %{
2049     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2050   %}
2051   ins_pipe(pipe_slow);
2052 %}
2053 
2054 instruct addD_mem(regD dst, memory src) %{
2055   predicate((UseSSE>=2) && (UseAVX == 0));
2056   match(Set dst (AddD dst (LoadD src)));
2057 
2058   format %{ "addsd   $dst, $src" %}
2059   ins_cost(150);
2060   ins_encode %{
2061     __ addsd($dst$$XMMRegister, $src$$Address);
2062   %}
2063   ins_pipe(pipe_slow);
2064 %}
2065 
2066 instruct addD_imm(regD dst, immD con) %{
2067   predicate((UseSSE>=2) && (UseAVX == 0));
2068   match(Set dst (AddD dst con));
2069   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2070   ins_cost(150);
2071   ins_encode %{
2072     __ addsd($dst$$XMMRegister, $constantaddress($con));
2073   %}
2074   ins_pipe(pipe_slow);
2075 %}
2076 
2077 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2078   predicate(UseAVX > 0);
2079   match(Set dst (AddD src1 src2));
2080 
2081   format %{ "vaddsd  $dst, $src1, $src2" %}
2082   ins_cost(150);
2083   ins_encode %{
2084     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2085   %}
2086   ins_pipe(pipe_slow);
2087 %}
2088 
2089 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2090   predicate(UseAVX > 0);
2091   match(Set dst (AddD src1 (LoadD src2)));
2092 
2093   format %{ "vaddsd  $dst, $src1, $src2" %}
2094   ins_cost(150);
2095   ins_encode %{
2096     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2097   %}
2098   ins_pipe(pipe_slow);
2099 %}
2100 
2101 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2102   predicate(UseAVX > 0);
2103   match(Set dst (AddD src con));
2104 
2105   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2106   ins_cost(150);
2107   ins_encode %{
2108     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2109   %}
2110   ins_pipe(pipe_slow);
2111 %}
2112 
2113 instruct subF_reg(regF dst, regF src) %{
2114   predicate((UseSSE>=1) && (UseAVX == 0));
2115   match(Set dst (SubF dst src));
2116 
2117   format %{ "subss   $dst, $src" %}
2118   ins_cost(150);
2119   ins_encode %{
2120     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2121   %}
2122   ins_pipe(pipe_slow);
2123 %}
2124 
2125 instruct subF_mem(regF dst, memory src) %{
2126   predicate((UseSSE>=1) && (UseAVX == 0));
2127   match(Set dst (SubF dst (LoadF src)));
2128 
2129   format %{ "subss   $dst, $src" %}
2130   ins_cost(150);
2131   ins_encode %{
2132     __ subss($dst$$XMMRegister, $src$$Address);
2133   %}
2134   ins_pipe(pipe_slow);
2135 %}
2136 
2137 instruct subF_imm(regF dst, immF con) %{
2138   predicate((UseSSE>=1) && (UseAVX == 0));
2139   match(Set dst (SubF dst con));
2140   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2141   ins_cost(150);
2142   ins_encode %{
2143     __ subss($dst$$XMMRegister, $constantaddress($con));
2144   %}
2145   ins_pipe(pipe_slow);
2146 %}
2147 
2148 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2149   predicate(UseAVX > 0);
2150   match(Set dst (SubF src1 src2));
2151 
2152   format %{ "vsubss  $dst, $src1, $src2" %}
2153   ins_cost(150);
2154   ins_encode %{
2155     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2156   %}
2157   ins_pipe(pipe_slow);
2158 %}
2159 
2160 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2161   predicate(UseAVX > 0);
2162   match(Set dst (SubF src1 (LoadF src2)));
2163 
2164   format %{ "vsubss  $dst, $src1, $src2" %}
2165   ins_cost(150);
2166   ins_encode %{
2167     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2168   %}
2169   ins_pipe(pipe_slow);
2170 %}
2171 
2172 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2173   predicate(UseAVX > 0);
2174   match(Set dst (SubF src con));
2175 
2176   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2177   ins_cost(150);
2178   ins_encode %{
2179     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2180   %}
2181   ins_pipe(pipe_slow);
2182 %}
2183 
2184 instruct subD_reg(regD dst, regD src) %{
2185   predicate((UseSSE>=2) && (UseAVX == 0));
2186   match(Set dst (SubD dst src));
2187 
2188   format %{ "subsd   $dst, $src" %}
2189   ins_cost(150);
2190   ins_encode %{
2191     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2192   %}
2193   ins_pipe(pipe_slow);
2194 %}
2195 
2196 instruct subD_mem(regD dst, memory src) %{
2197   predicate((UseSSE>=2) && (UseAVX == 0));
2198   match(Set dst (SubD dst (LoadD src)));
2199 
2200   format %{ "subsd   $dst, $src" %}
2201   ins_cost(150);
2202   ins_encode %{
2203     __ subsd($dst$$XMMRegister, $src$$Address);
2204   %}
2205   ins_pipe(pipe_slow);
2206 %}
2207 
2208 instruct subD_imm(regD dst, immD con) %{
2209   predicate((UseSSE>=2) && (UseAVX == 0));
2210   match(Set dst (SubD dst con));
2211   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2212   ins_cost(150);
2213   ins_encode %{
2214     __ subsd($dst$$XMMRegister, $constantaddress($con));
2215   %}
2216   ins_pipe(pipe_slow);
2217 %}
2218 
2219 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2220   predicate(UseAVX > 0);
2221   match(Set dst (SubD src1 src2));
2222 
2223   format %{ "vsubsd  $dst, $src1, $src2" %}
2224   ins_cost(150);
2225   ins_encode %{
2226     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2227   %}
2228   ins_pipe(pipe_slow);
2229 %}
2230 
2231 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2232   predicate(UseAVX > 0);
2233   match(Set dst (SubD src1 (LoadD src2)));
2234 
2235   format %{ "vsubsd  $dst, $src1, $src2" %}
2236   ins_cost(150);
2237   ins_encode %{
2238     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2239   %}
2240   ins_pipe(pipe_slow);
2241 %}
2242 
2243 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2244   predicate(UseAVX > 0);
2245   match(Set dst (SubD src con));
2246 
2247   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2248   ins_cost(150);
2249   ins_encode %{
2250     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2251   %}
2252   ins_pipe(pipe_slow);
2253 %}
2254 
2255 instruct mulF_reg(regF dst, regF src) %{
2256   predicate((UseSSE>=1) && (UseAVX == 0));
2257   match(Set dst (MulF dst src));
2258 
2259   format %{ "mulss   $dst, $src" %}
2260   ins_cost(150);
2261   ins_encode %{
2262     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2263   %}
2264   ins_pipe(pipe_slow);
2265 %}
2266 
2267 instruct mulF_mem(regF dst, memory src) %{
2268   predicate((UseSSE>=1) && (UseAVX == 0));
2269   match(Set dst (MulF dst (LoadF src)));
2270 
2271   format %{ "mulss   $dst, $src" %}
2272   ins_cost(150);
2273   ins_encode %{
2274     __ mulss($dst$$XMMRegister, $src$$Address);
2275   %}
2276   ins_pipe(pipe_slow);
2277 %}
2278 
2279 instruct mulF_imm(regF dst, immF con) %{
2280   predicate((UseSSE>=1) && (UseAVX == 0));
2281   match(Set dst (MulF dst con));
2282   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2283   ins_cost(150);
2284   ins_encode %{
2285     __ mulss($dst$$XMMRegister, $constantaddress($con));
2286   %}
2287   ins_pipe(pipe_slow);
2288 %}
2289 
2290 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2291   predicate(UseAVX > 0);
2292   match(Set dst (MulF src1 src2));
2293 
2294   format %{ "vmulss  $dst, $src1, $src2" %}
2295   ins_cost(150);
2296   ins_encode %{
2297     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2298   %}
2299   ins_pipe(pipe_slow);
2300 %}
2301 
2302 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2303   predicate(UseAVX > 0);
2304   match(Set dst (MulF src1 (LoadF src2)));
2305 
2306   format %{ "vmulss  $dst, $src1, $src2" %}
2307   ins_cost(150);
2308   ins_encode %{
2309     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2310   %}
2311   ins_pipe(pipe_slow);
2312 %}
2313 
2314 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2315   predicate(UseAVX > 0);
2316   match(Set dst (MulF src con));
2317 
2318   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2319   ins_cost(150);
2320   ins_encode %{
2321     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2322   %}
2323   ins_pipe(pipe_slow);
2324 %}
2325 
2326 instruct mulD_reg(regD dst, regD src) %{
2327   predicate((UseSSE>=2) && (UseAVX == 0));
2328   match(Set dst (MulD dst src));
2329 
2330   format %{ "mulsd   $dst, $src" %}
2331   ins_cost(150);
2332   ins_encode %{
2333     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2334   %}
2335   ins_pipe(pipe_slow);
2336 %}
2337 
2338 instruct mulD_mem(regD dst, memory src) %{
2339   predicate((UseSSE>=2) && (UseAVX == 0));
2340   match(Set dst (MulD dst (LoadD src)));
2341 
2342   format %{ "mulsd   $dst, $src" %}
2343   ins_cost(150);
2344   ins_encode %{
2345     __ mulsd($dst$$XMMRegister, $src$$Address);
2346   %}
2347   ins_pipe(pipe_slow);
2348 %}
2349 
2350 instruct mulD_imm(regD dst, immD con) %{
2351   predicate((UseSSE>=2) && (UseAVX == 0));
2352   match(Set dst (MulD dst con));
2353   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2354   ins_cost(150);
2355   ins_encode %{
2356     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2357   %}
2358   ins_pipe(pipe_slow);
2359 %}
2360 
2361 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2362   predicate(UseAVX > 0);
2363   match(Set dst (MulD src1 src2));
2364 
2365   format %{ "vmulsd  $dst, $src1, $src2" %}
2366   ins_cost(150);
2367   ins_encode %{
2368     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2369   %}
2370   ins_pipe(pipe_slow);
2371 %}
2372 
2373 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2374   predicate(UseAVX > 0);
2375   match(Set dst (MulD src1 (LoadD src2)));
2376 
2377   format %{ "vmulsd  $dst, $src1, $src2" %}
2378   ins_cost(150);
2379   ins_encode %{
2380     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2381   %}
2382   ins_pipe(pipe_slow);
2383 %}
2384 
2385 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2386   predicate(UseAVX > 0);
2387   match(Set dst (MulD src con));
2388 
2389   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2390   ins_cost(150);
2391   ins_encode %{
2392     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2393   %}
2394   ins_pipe(pipe_slow);
2395 %}
2396 
2397 instruct divF_reg(regF dst, regF src) %{
2398   predicate((UseSSE>=1) && (UseAVX == 0));
2399   match(Set dst (DivF dst src));
2400 
2401   format %{ "divss   $dst, $src" %}
2402   ins_cost(150);
2403   ins_encode %{
2404     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2405   %}
2406   ins_pipe(pipe_slow);
2407 %}
2408 
2409 instruct divF_mem(regF dst, memory src) %{
2410   predicate((UseSSE>=1) && (UseAVX == 0));
2411   match(Set dst (DivF dst (LoadF src)));
2412 
2413   format %{ "divss   $dst, $src" %}
2414   ins_cost(150);
2415   ins_encode %{
2416     __ divss($dst$$XMMRegister, $src$$Address);
2417   %}
2418   ins_pipe(pipe_slow);
2419 %}
2420 
2421 instruct divF_imm(regF dst, immF con) %{
2422   predicate((UseSSE>=1) && (UseAVX == 0));
2423   match(Set dst (DivF dst con));
2424   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2425   ins_cost(150);
2426   ins_encode %{
2427     __ divss($dst$$XMMRegister, $constantaddress($con));
2428   %}
2429   ins_pipe(pipe_slow);
2430 %}
2431 
2432 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2433   predicate(UseAVX > 0);
2434   match(Set dst (DivF src1 src2));
2435 
2436   format %{ "vdivss  $dst, $src1, $src2" %}
2437   ins_cost(150);
2438   ins_encode %{
2439     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2440   %}
2441   ins_pipe(pipe_slow);
2442 %}
2443 
2444 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2445   predicate(UseAVX > 0);
2446   match(Set dst (DivF src1 (LoadF src2)));
2447 
2448   format %{ "vdivss  $dst, $src1, $src2" %}
2449   ins_cost(150);
2450   ins_encode %{
2451     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2452   %}
2453   ins_pipe(pipe_slow);
2454 %}
2455 
2456 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2457   predicate(UseAVX > 0);
2458   match(Set dst (DivF src con));
2459 
2460   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2461   ins_cost(150);
2462   ins_encode %{
2463     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2464   %}
2465   ins_pipe(pipe_slow);
2466 %}
2467 
2468 instruct divD_reg(regD dst, regD src) %{
2469   predicate((UseSSE>=2) && (UseAVX == 0));
2470   match(Set dst (DivD dst src));
2471 
2472   format %{ "divsd   $dst, $src" %}
2473   ins_cost(150);
2474   ins_encode %{
2475     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2476   %}
2477   ins_pipe(pipe_slow);
2478 %}
2479 
2480 instruct divD_mem(regD dst, memory src) %{
2481   predicate((UseSSE>=2) && (UseAVX == 0));
2482   match(Set dst (DivD dst (LoadD src)));
2483 
2484   format %{ "divsd   $dst, $src" %}
2485   ins_cost(150);
2486   ins_encode %{
2487     __ divsd($dst$$XMMRegister, $src$$Address);
2488   %}
2489   ins_pipe(pipe_slow);
2490 %}
2491 
2492 instruct divD_imm(regD dst, immD con) %{
2493   predicate((UseSSE>=2) && (UseAVX == 0));
2494   match(Set dst (DivD dst con));
2495   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2496   ins_cost(150);
2497   ins_encode %{
2498     __ divsd($dst$$XMMRegister, $constantaddress($con));
2499   %}
2500   ins_pipe(pipe_slow);
2501 %}
2502 
2503 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2504   predicate(UseAVX > 0);
2505   match(Set dst (DivD src1 src2));
2506 
2507   format %{ "vdivsd  $dst, $src1, $src2" %}
2508   ins_cost(150);
2509   ins_encode %{
2510     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2511   %}
2512   ins_pipe(pipe_slow);
2513 %}
2514 
2515 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2516   predicate(UseAVX > 0);
2517   match(Set dst (DivD src1 (LoadD src2)));
2518 
2519   format %{ "vdivsd  $dst, $src1, $src2" %}
2520   ins_cost(150);
2521   ins_encode %{
2522     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2523   %}
2524   ins_pipe(pipe_slow);
2525 %}
2526 
2527 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2528   predicate(UseAVX > 0);
2529   match(Set dst (DivD src con));
2530 
2531   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2532   ins_cost(150);
2533   ins_encode %{
2534     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2535   %}
2536   ins_pipe(pipe_slow);
2537 %}
2538 
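// Absolute value is computed by clearing the sign bit: the value is ANDed
// with a mask constant (0x7fffffff for float, 0x7fffffffffffffff for double)
// located at float_signmask()/double_signmask().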
2539 instruct absF_reg(regF dst) %{
2540   predicate((UseSSE>=1) && (UseAVX == 0));
2541   match(Set dst (AbsF dst));
2542   ins_cost(150);
2543   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2544   ins_encode %{
2545     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2546   %}
2547   ins_pipe(pipe_slow);
2548 %}
2549 
2550 instruct absF_reg_reg(regF dst, regF src) %{
2551   predicate(VM_Version::supports_avxonly());
2552   match(Set dst (AbsF src));
2553   ins_cost(150);
2554   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2555   ins_encode %{
2556     int vector_len = 0;
2557     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2558               ExternalAddress(float_signmask()), vector_len);
2559   %}
2560   ins_pipe(pipe_slow);
2561 %}
2562 
2563 #ifdef _LP64
2564 instruct absF_reg_reg_evex(regF dst, regF src) %{
2565   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2566   match(Set dst (AbsF src));
2567   ins_cost(150);
2568   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2569   ins_encode %{
2570     int vector_len = 0;
2571     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2572               ExternalAddress(float_signmask()), vector_len);
2573   %}
2574   ins_pipe(pipe_slow);
2575 %}
2576 
2577 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2578   predicate(VM_Version::supports_avx512novl());
2579   match(Set dst (AbsF src1));
2580   effect(TEMP src2);
2581   ins_cost(150);
2582   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2583   ins_encode %{
2584     int vector_len = 0;
2585     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2586               ExternalAddress(float_signmask()), vector_len);
2587   %}
2588   ins_pipe(pipe_slow);
2589 %}
2590 #else // _LP64
2591 instruct absF_reg_reg_evex(regF dst, regF src) %{
2592   predicate(UseAVX > 2);
2593   match(Set dst (AbsF src));
2594   ins_cost(150);
2595   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2596   ins_encode %{
2597     int vector_len = 0;
2598     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2599               ExternalAddress(float_signmask()), vector_len);
2600   %}
2601   ins_pipe(pipe_slow);
2602 %}
2603 #endif
2604 
2605 instruct absD_reg(regD dst) %{
2606   predicate((UseSSE>=2) && (UseAVX == 0));
2607   match(Set dst (AbsD dst));
2608   ins_cost(150);
2609   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2610             "# abs double by sign masking" %}
2611   ins_encode %{
2612     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2613   %}
2614   ins_pipe(pipe_slow);
2615 %}
2616 
2617 instruct absD_reg_reg(regD dst, regD src) %{
2618   predicate(VM_Version::supports_avxonly());
2619   match(Set dst (AbsD src));
2620   ins_cost(150);
2621   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2622             "# abs double by sign masking" %}
2623   ins_encode %{
2624     int vector_len = 0;
2625     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2626               ExternalAddress(double_signmask()), vector_len);
2627   %}
2628   ins_pipe(pipe_slow);
2629 %}
2630 
2631 #ifdef _LP64
2632 instruct absD_reg_reg_evex(regD dst, regD src) %{
2633   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2634   match(Set dst (AbsD src));
2635   ins_cost(150);
2636   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2637             "# abs double by sign masking" %}
2638   ins_encode %{
2639     int vector_len = 0;
2640     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2641               ExternalAddress(double_signmask()), vector_len);
2642   %}
2643   ins_pipe(pipe_slow);
2644 %}
2645 
2646 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2647   predicate(VM_Version::supports_avx512novl());
2648   match(Set dst (AbsD src1));
2649   effect(TEMP src2);
2650   ins_cost(150);
2651   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs double by sign masking" %}
2652   ins_encode %{
2653     int vector_len = 0;
2654     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2655               ExternalAddress(double_signmask()), vector_len);
2656   %}
2657   ins_pipe(pipe_slow);
2658 %}
2659 #else // _LP64
2660 instruct absD_reg_reg_evex(regD dst, regD src) %{
2661   predicate(UseAVX > 2);
2662   match(Set dst (AbsD src));
2663   ins_cost(150);
2664   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2665             "# abs double by sign masking" %}
2666   ins_encode %{
2667     int vector_len = 0;
2668     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2669               ExternalAddress(double_signmask()), vector_len);
2670   %}
2671   ins_pipe(pipe_slow);
2672 %}
2673 #endif
2674 
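// Negation is computed by flipping the sign bit: the value is XORed with
// 0x80000000 (float) or 0x8000000000000000 (double), located at
// float_signflip()/double_signflip().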
2675 instruct negF_reg(regF dst) %{
2676   predicate((UseSSE>=1) && (UseAVX == 0));
2677   match(Set dst (NegF dst));
2678   ins_cost(150);
2679   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2680   ins_encode %{
2681     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2682   %}
2683   ins_pipe(pipe_slow);
2684 %}
2685 
2686 instruct negF_reg_reg(regF dst, regF src) %{
2687   predicate(UseAVX > 0);
2688   match(Set dst (NegF src));
2689   ins_cost(150);
2690   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2691   ins_encode %{
2692     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2693                  ExternalAddress(float_signflip()));
2694   %}
2695   ins_pipe(pipe_slow);
2696 %}
2697 
2698 instruct negD_reg(regD dst) %{
2699   predicate((UseSSE>=2) && (UseAVX == 0));
2700   match(Set dst (NegD dst));
2701   ins_cost(150);
2702   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2703             "# neg double by sign flipping" %}
2704   ins_encode %{
2705     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2706   %}
2707   ins_pipe(pipe_slow);
2708 %}
2709 
2710 instruct negD_reg_reg(regD dst, regD src) %{
2711   predicate(UseAVX > 0);
2712   match(Set dst (NegD src));
2713   ins_cost(150);
2714   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2715             "# neg double by sign flipping" %}
2716   ins_encode %{
2717     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2718                  ExternalAddress(double_signflip()));
2719   %}
2720   ins_pipe(pipe_slow);
2721 %}
2722 
2723 instruct sqrtF_reg(regF dst, regF src) %{
2724   predicate(UseSSE>=1);
2725   match(Set dst (SqrtF src));
2726 
2727   format %{ "sqrtss  $dst, $src" %}
2728   ins_cost(150);
2729   ins_encode %{
2730     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2731   %}
2732   ins_pipe(pipe_slow);
2733 %}
2734 
2735 instruct sqrtF_mem(regF dst, memory src) %{
2736   predicate(UseSSE>=1);
2737   match(Set dst (SqrtF (LoadF src)));
2738 
2739   format %{ "sqrtss  $dst, $src" %}
2740   ins_cost(150);
2741   ins_encode %{
2742     __ sqrtss($dst$$XMMRegister, $src$$Address);
2743   %}
2744   ins_pipe(pipe_slow);
2745 %}
2746 
2747 instruct sqrtF_imm(regF dst, immF con) %{
2748   predicate(UseSSE>=1);
2749   match(Set dst (SqrtF con));
2750 
2751   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2752   ins_cost(150);
2753   ins_encode %{
2754     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2755   %}
2756   ins_pipe(pipe_slow);
2757 %}
2758 
2759 instruct sqrtD_reg(regD dst, regD src) %{
2760   predicate(UseSSE>=2);
2761   match(Set dst (SqrtD src));
2762 
2763   format %{ "sqrtsd  $dst, $src" %}
2764   ins_cost(150);
2765   ins_encode %{
2766     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2767   %}
2768   ins_pipe(pipe_slow);
2769 %}
2770 
2771 instruct sqrtD_mem(regD dst, memory src) %{
2772   predicate(UseSSE>=2);
2773   match(Set dst (SqrtD (LoadD src)));
2774 
2775   format %{ "sqrtsd  $dst, $src" %}
2776   ins_cost(150);
2777   ins_encode %{
2778     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2779   %}
2780   ins_pipe(pipe_slow);
2781 %}
2782 
2783 instruct sqrtD_imm(regD dst, immD con) %{
2784   predicate(UseSSE>=2);
2785   match(Set dst (SqrtD con));
2786   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2787   ins_cost(150);
2788   ins_encode %{
2789     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2790   %}
2791   ins_pipe(pipe_slow);
2792 %}
2793 
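// Thread.onSpinWait() intrinsic: emits the x86 pause instruction, which hints
// to the core that the thread is in a spin-wait loop.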
2794 instruct onspinwait() %{
2795   match(OnSpinWait);
2796   ins_cost(200);
2797 
2798   format %{
2799     $$template
2800     $$emit$$"pause\t! membar_onspinwait"
2801   %}
2802   ins_encode %{
2803     __ pause();
2804   %}
2805   ins_pipe(pipe_slow);
2806 %}
2807 
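// The FMA patterns below are guarded by UseFMA and compute a * b + c with a
// single rounding; the fmad/fmaf macro assembler helpers are expected to
// expand to the scalar fused multiply-add instructions.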
2808 // a * b + c
2809 instruct fmaD_reg(regD a, regD b, regD c) %{
2810   predicate(UseFMA);
2811   match(Set c (FmaD  c (Binary a b)));
2812   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2813   ins_cost(150);
2814   ins_encode %{
2815     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2816   %}
2817   ins_pipe( pipe_slow );
2818 %}
2819 
2820 // a * b + c
2821 instruct fmaF_reg(regF a, regF b, regF c) %{
2822   predicate(UseFMA);
2823   match(Set c (FmaF  c (Binary a b)));
2824   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2825   ins_cost(150);
2826   ins_encode %{
2827     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2828   %}
2829   ins_pipe( pipe_slow );
2830 %}
2831 
2832 // ====================VECTOR INSTRUCTIONS=====================================
2833 
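// Vector operand classes correspond to register widths: vecS = 4 bytes,
// vecD = 8, vecX = 16 (XMM), vecY = 32 (YMM), vecZ = 64 (ZMM).  In the
// encodings below, vector_len selects the encoded vector width:
// 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.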
2834 // Load vectors (4 bytes long)
2835 instruct loadV4(vecS dst, memory mem) %{
2836   predicate(n->as_LoadVector()->memory_size() == 4);
2837   match(Set dst (LoadVector mem));
2838   ins_cost(125);
2839   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2840   ins_encode %{
2841     __ movdl($dst$$XMMRegister, $mem$$Address);
2842   %}
2843   ins_pipe( pipe_slow );
2844 %}
2845 
2846 // Load vectors (8 bytes long)
2847 instruct loadV8(vecD dst, memory mem) %{
2848   predicate(n->as_LoadVector()->memory_size() == 8);
2849   match(Set dst (LoadVector mem));
2850   ins_cost(125);
2851   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2852   ins_encode %{
2853     __ movq($dst$$XMMRegister, $mem$$Address);
2854   %}
2855   ins_pipe( pipe_slow );
2856 %}
2857 
2858 // Load vectors (16 bytes long)
2859 instruct loadV16(vecX dst, memory mem) %{
2860   predicate(n->as_LoadVector()->memory_size() == 16);
2861   match(Set dst (LoadVector mem));
2862   ins_cost(125);
2863   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2864   ins_encode %{
2865     __ movdqu($dst$$XMMRegister, $mem$$Address);
2866   %}
2867   ins_pipe( pipe_slow );
2868 %}
2869 
2870 // Load vectors (32 bytes long)
2871 instruct loadV32(vecY dst, memory mem) %{
2872   predicate(n->as_LoadVector()->memory_size() == 32);
2873   match(Set dst (LoadVector mem));
2874   ins_cost(125);
2875   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2876   ins_encode %{
2877     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2878   %}
2879   ins_pipe( pipe_slow );
2880 %}
2881 
2882 // Load vectors (64 bytes long)
2883 instruct loadV64_dword(vecZ dst, memory mem) %{
2884   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2885   match(Set dst (LoadVector mem));
2886   ins_cost(125);
2887   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
2888   ins_encode %{
2889     int vector_len = 2;
2890     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
2891   %}
2892   ins_pipe( pipe_slow );
2893 %}
2894 
2895 // Load vectors (64 bytes long)
2896 instruct loadV64_qword(vecZ dst, memory mem) %{
2897   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
2898   match(Set dst (LoadVector mem));
2899   ins_cost(125);
2900   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
2901   ins_encode %{
2902     int vector_len = 2;
2903     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
2904   %}
2905   ins_pipe( pipe_slow );
2906 %}
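// The 64-byte forms (here and in the stores below) are split on element size:
// evmovdqul is used when elements are at most 4 bytes wide and evmovdquq when
// they are 8 bytes wide.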
2907 
2908 // Store vectors
2909 instruct storeV4(memory mem, vecS src) %{
2910   predicate(n->as_StoreVector()->memory_size() == 4);
2911   match(Set mem (StoreVector mem src));
2912   ins_cost(145);
2913   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
2914   ins_encode %{
2915     __ movdl($mem$$Address, $src$$XMMRegister);
2916   %}
2917   ins_pipe( pipe_slow );
2918 %}
2919 
2920 instruct storeV8(memory mem, vecD src) %{
2921   predicate(n->as_StoreVector()->memory_size() == 8);
2922   match(Set mem (StoreVector mem src));
2923   ins_cost(145);
2924   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
2925   ins_encode %{
2926     __ movq($mem$$Address, $src$$XMMRegister);
2927   %}
2928   ins_pipe( pipe_slow );
2929 %}
2930 
2931 instruct storeV16(memory mem, vecX src) %{
2932   predicate(n->as_StoreVector()->memory_size() == 16);
2933   match(Set mem (StoreVector mem src));
2934   ins_cost(145);
2935   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
2936   ins_encode %{
2937     __ movdqu($mem$$Address, $src$$XMMRegister);
2938   %}
2939   ins_pipe( pipe_slow );
2940 %}
2941 
2942 instruct storeV32(memory mem, vecY src) %{
2943   predicate(n->as_StoreVector()->memory_size() == 32);
2944   match(Set mem (StoreVector mem src));
2945   ins_cost(145);
2946   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2947   ins_encode %{
2948     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2949   %}
2950   ins_pipe( pipe_slow );
2951 %}
2952 
2953 instruct storeV64_dword(memory mem, vecZ src) %{
2954   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
2955   match(Set mem (StoreVector mem src));
2956   ins_cost(145);
2957   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
2958   ins_encode %{
2959     int vector_len = 2;
2960     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
2961   %}
2962   ins_pipe( pipe_slow );
2963 %}
2964 
2965 instruct storeV64_qword(memory mem, vecZ src) %{
2966   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
2967   match(Set mem (StoreVector mem src));
2968   ins_cost(145);
2969   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
2970   ins_encode %{
2971     int vector_len = 2;
2972     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
2973   %}
2974   ins_pipe( pipe_slow );
2975 %}
2976 
2977 // ====================LEGACY REPLICATE=======================================
2978 
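// These replicate patterns are used when the AVX-512 VL/BW broadcast forms
// (see the EVEX REPLICATE section below) are not available.  The scalar is
// moved into an XMM register and spread with shuffles: pshuflw/pshufd with an
// immediate of 0x00 broadcasts the low element, pshufd with 0x44 duplicates
// the low 64 bits, punpcklqdq fills the full 128 bits, and
// vinserti128_high/vinsertf128_high copies the low 128 bits into the upper
// half of a YMM register.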
2979 instruct Repl4B_mem(vecS dst, memory mem) %{
2980   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2981   match(Set dst (ReplicateB (LoadB mem)));
2982   format %{ "punpcklbw $dst,$mem\n\t"
2983             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
2984   ins_encode %{
2985     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2986     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2987   %}
2988   ins_pipe( pipe_slow );
2989 %}
2990 
2991 instruct Repl8B_mem(vecD dst, memory mem) %{
2992   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2993   match(Set dst (ReplicateB (LoadB mem)));
2994   format %{ "punpcklbw $dst,$mem\n\t"
2995             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
2996   ins_encode %{
2997     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2998     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2999   %}
3000   ins_pipe( pipe_slow );
3001 %}
3002 
3003 instruct Repl16B(vecX dst, rRegI src) %{
3004   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3005   match(Set dst (ReplicateB src));
3006   format %{ "movd    $dst,$src\n\t"
3007             "punpcklbw $dst,$dst\n\t"
3008             "pshuflw $dst,$dst,0x00\n\t"
3009             "punpcklqdq $dst,$dst\t! replicate16B" %}
3010   ins_encode %{
3011     __ movdl($dst$$XMMRegister, $src$$Register);
3012     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3013     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3014     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3015   %}
3016   ins_pipe( pipe_slow );
3017 %}
3018 
3019 instruct Repl16B_mem(vecX dst, memory mem) %{
3020   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3021   match(Set dst (ReplicateB (LoadB mem)));
3022   format %{ "punpcklbw $dst,$mem\n\t"
3023             "pshuflw $dst,$dst,0x00\n\t"
3024             "punpcklqdq $dst,$dst\t! replicate16B" %}
3025   ins_encode %{
3026     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3027     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3028     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3029   %}
3030   ins_pipe( pipe_slow );
3031 %}
3032 
3033 instruct Repl32B(vecY dst, rRegI src) %{
3034   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3035   match(Set dst (ReplicateB src));
3036   format %{ "movd    $dst,$src\n\t"
3037             "punpcklbw $dst,$dst\n\t"
3038             "pshuflw $dst,$dst,0x00\n\t"
3039             "punpcklqdq $dst,$dst\n\t"
3040             "vinserti128_high $dst,$dst\t! replicate32B" %}
3041   ins_encode %{
3042     __ movdl($dst$$XMMRegister, $src$$Register);
3043     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3044     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3045     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3046     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3047   %}
3048   ins_pipe( pipe_slow );
3049 %}
3050 
3051 instruct Repl32B_mem(vecY dst, memory mem) %{
3052   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3053   match(Set dst (ReplicateB (LoadB mem)));
3054   format %{ "punpcklbw $dst,$mem\n\t"
3055             "pshuflw $dst,$dst,0x00\n\t"
3056             "punpcklqdq $dst,$dst\n\t"
3057             "vinserti128_high $dst,$dst\t! replicate32B" %}
3058   ins_encode %{
3059     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3060     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3061     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3062     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3063   %}
3064   ins_pipe( pipe_slow );
3065 %}
3066 
3067 instruct Repl16B_imm(vecX dst, immI con) %{
3068   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3069   match(Set dst (ReplicateB con));
3070   format %{ "movq    $dst,[$constantaddress]\n\t"
3071             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3072   ins_encode %{
3073     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3074     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3075   %}
3076   ins_pipe( pipe_slow );
3077 %}
3078 
3079 instruct Repl32B_imm(vecY dst, immI con) %{
3080   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3081   match(Set dst (ReplicateB con));
3082   format %{ "movq    $dst,[$constantaddress]\n\t"
3083             "punpcklqdq $dst,$dst\n\t"
3084             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3085   ins_encode %{
3086     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3087     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3088     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3089   %}
3090   ins_pipe( pipe_slow );
3091 %}
3092 
3093 instruct Repl4S(vecD dst, rRegI src) %{
3094   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3095   match(Set dst (ReplicateS src));
3096   format %{ "movd    $dst,$src\n\t"
3097             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3098   ins_encode %{
3099     __ movdl($dst$$XMMRegister, $src$$Register);
3100     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3101   %}
3102   ins_pipe( pipe_slow );
3103 %}
3104 
3105 instruct Repl4S_mem(vecD dst, memory mem) %{
3106   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3107   match(Set dst (ReplicateS (LoadS mem)));
3108   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3109   ins_encode %{
3110     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3111   %}
3112   ins_pipe( pipe_slow );
3113 %}
3114 
3115 instruct Repl8S(vecX dst, rRegI src) %{
3116   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3117   match(Set dst (ReplicateS src));
3118   format %{ "movd    $dst,$src\n\t"
3119             "pshuflw $dst,$dst,0x00\n\t"
3120             "punpcklqdq $dst,$dst\t! replicate8S" %}
3121   ins_encode %{
3122     __ movdl($dst$$XMMRegister, $src$$Register);
3123     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3124     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3125   %}
3126   ins_pipe( pipe_slow );
3127 %}
3128 
3129 instruct Repl8S_mem(vecX dst, memory mem) %{
3130   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3131   match(Set dst (ReplicateS (LoadS mem)));
3132   format %{ "pshuflw $dst,$mem,0x00\n\t"
3133             "punpcklqdq $dst,$dst\t! replicate8S" %}
3134   ins_encode %{
3135     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3136     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3137   %}
3138   ins_pipe( pipe_slow );
3139 %}
3140 
3141 instruct Repl8S_imm(vecX dst, immI con) %{
3142   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3143   match(Set dst (ReplicateS con));
3144   format %{ "movq    $dst,[$constantaddress]\n\t"
3145             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3146   ins_encode %{
3147     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3148     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3149   %}
3150   ins_pipe( pipe_slow );
3151 %}
3152 
3153 instruct Repl16S(vecY dst, rRegI src) %{
3154   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3155   match(Set dst (ReplicateS src));
3156   format %{ "movd    $dst,$src\n\t"
3157             "pshuflw $dst,$dst,0x00\n\t"
3158             "punpcklqdq $dst,$dst\n\t"
3159             "vinserti128_high $dst,$dst\t! replicate16S" %}
3160   ins_encode %{
3161     __ movdl($dst$$XMMRegister, $src$$Register);
3162     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3163     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3164     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3165   %}
3166   ins_pipe( pipe_slow );
3167 %}
3168 
3169 instruct Repl16S_mem(vecY dst, memory mem) %{
3170   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3171   match(Set dst (ReplicateS (LoadS mem)));
3172   format %{ "pshuflw $dst,$mem,0x00\n\t"
3173             "punpcklqdq $dst,$dst\n\t"
3174             "vinserti128_high $dst,$dst\t! replicate16S" %}
3175   ins_encode %{
3176     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3177     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3178     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3179   %}
3180   ins_pipe( pipe_slow );
3181 %}
3182 
3183 instruct Repl16S_imm(vecY dst, immI con) %{
3184   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3185   match(Set dst (ReplicateS con));
3186   format %{ "movq    $dst,[$constantaddress]\n\t"
3187             "punpcklqdq $dst,$dst\n\t"
3188             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3189   ins_encode %{
3190     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3191     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3192     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3193   %}
3194   ins_pipe( pipe_slow );
3195 %}
3196 
3197 instruct Repl4I(vecX dst, rRegI src) %{
3198   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3199   match(Set dst (ReplicateI src));
3200   format %{ "movd    $dst,$src\n\t"
3201             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3202   ins_encode %{
3203     __ movdl($dst$$XMMRegister, $src$$Register);
3204     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3205   %}
3206   ins_pipe( pipe_slow );
3207 %}
3208 
3209 instruct Repl4I_mem(vecX dst, memory mem) %{
3210   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3211   match(Set dst (ReplicateI (LoadI mem)));
3212   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3213   ins_encode %{
3214     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3215   %}
3216   ins_pipe( pipe_slow );
3217 %}
3218 
3219 instruct Repl8I(vecY dst, rRegI src) %{
3220   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3221   match(Set dst (ReplicateI src));
3222   format %{ "movd    $dst,$src\n\t"
3223             "pshufd  $dst,$dst,0x00\n\t"
3224             "vinserti128_high $dst,$dst\t! replicate8I" %}
3225   ins_encode %{
3226     __ movdl($dst$$XMMRegister, $src$$Register);
3227     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3228     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3229   %}
3230   ins_pipe( pipe_slow );
3231 %}
3232 
3233 instruct Repl8I_mem(vecY dst, memory mem) %{
3234   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3235   match(Set dst (ReplicateI (LoadI mem)));
3236   format %{ "pshufd  $dst,$mem,0x00\n\t"
3237             "vinserti128_high $dst,$dst\t! replicate8I" %}
3238   ins_encode %{
3239     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3240     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3241   %}
3242   ins_pipe( pipe_slow );
3243 %}
3244 
3245 instruct Repl4I_imm(vecX dst, immI con) %{
3246   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3247   match(Set dst (ReplicateI con));
3248   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3249             "punpcklqdq $dst,$dst" %}
3250   ins_encode %{
3251     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3252     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3253   %}
3254   ins_pipe( pipe_slow );
3255 %}
3256 
3257 instruct Repl8I_imm(vecY dst, immI con) %{
3258   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3259   match(Set dst (ReplicateI con));
3260   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3261             "punpcklqdq $dst,$dst\n\t"
3262             "vinserti128_high $dst,$dst" %}
3263   ins_encode %{
3264     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3265     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3266     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3267   %}
3268   ins_pipe( pipe_slow );
3269 %}
3270 
3271 // A long can be loaded into an XMM register directly from memory.
3272 instruct Repl2L_mem(vecX dst, memory mem) %{
3273   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3274   match(Set dst (ReplicateL (LoadL mem)));
3275   format %{ "movq    $dst,$mem\n\t"
3276             "punpcklqdq $dst,$dst\t! replicate2L" %}
3277   ins_encode %{
3278     __ movq($dst$$XMMRegister, $mem$$Address);
3279     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3280   %}
3281   ins_pipe( pipe_slow );
3282 %}
3283 
3284 // Replicate long (8 byte) scalar to be vector
3285 #ifdef _LP64
3286 instruct Repl4L(vecY dst, rRegL src) %{
3287   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3288   match(Set dst (ReplicateL src));
3289   format %{ "movdq   $dst,$src\n\t"
3290             "punpcklqdq $dst,$dst\n\t"
3291             "vinserti128_high $dst,$dst\t! replicate4L" %}
3292   ins_encode %{
3293     __ movdq($dst$$XMMRegister, $src$$Register);
3294     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3295     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3296   %}
3297   ins_pipe( pipe_slow );
3298 %}
3299 #else // _LP64
3300 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3301   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3302   match(Set dst (ReplicateL src));
3303   effect(TEMP dst, USE src, TEMP tmp);
3304   format %{ "movdl   $dst,$src.lo\n\t"
3305             "movdl   $tmp,$src.hi\n\t"
3306             "punpckldq $dst,$tmp\n\t"
3307             "punpcklqdq $dst,$dst\n\t"
3308             "vinserti128_high $dst,$dst\t! replicate4L" %}
3309   ins_encode %{
3310     __ movdl($dst$$XMMRegister, $src$$Register);
3311     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3312     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3313     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3314     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3315   %}
3316   ins_pipe( pipe_slow );
3317 %}
3318 #endif // _LP64
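// In the 32-bit case above the long arrives as a register pair, so the low
// and high halves are moved into XMM registers separately and combined with
// punpckldq before the quadword is replicated.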
3319 
3320 instruct Repl4L_imm(vecY dst, immL con) %{
3321   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3322   match(Set dst (ReplicateL con));
3323   format %{ "movq    $dst,[$constantaddress]\n\t"
3324             "punpcklqdq $dst,$dst\n\t"
3325             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3326   ins_encode %{
3327     __ movq($dst$$XMMRegister, $constantaddress($con));
3328     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3329     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3330   %}
3331   ins_pipe( pipe_slow );
3332 %}
3333 
3334 instruct Repl4L_mem(vecY dst, memory mem) %{
3335   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3336   match(Set dst (ReplicateL (LoadL mem)));
3337   format %{ "movq    $dst,$mem\n\t"
3338             "punpcklqdq $dst,$dst\n\t"
3339             "vinserti128_high $dst,$dst\t! replicate4L" %}
3340   ins_encode %{
3341     __ movq($dst$$XMMRegister, $mem$$Address);
3342     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3343     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3344   %}
3345   ins_pipe( pipe_slow );
3346 %}
3347 
3348 instruct Repl2F_mem(vecD dst, memory mem) %{
3349   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3350   match(Set dst (ReplicateF (LoadF mem)));
3351   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3352   ins_encode %{
3353     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3354   %}
3355   ins_pipe( pipe_slow );
3356 %}
3357 
3358 instruct Repl4F_mem(vecX dst, memory mem) %{
3359   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3360   match(Set dst (ReplicateF (LoadF mem)));
3361   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3362   ins_encode %{
3363     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3364   %}
3365   ins_pipe( pipe_slow );
3366 %}
3367 
3368 instruct Repl8F(vecY dst, regF src) %{
3369   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3370   match(Set dst (ReplicateF src));
3371   format %{ "pshufd  $dst,$src,0x00\n\t"
3372             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3373   ins_encode %{
3374     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3375     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3376   %}
3377   ins_pipe( pipe_slow );
3378 %}
3379 
3380 instruct Repl8F_mem(vecY dst, memory mem) %{
3381   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3382   match(Set dst (ReplicateF (LoadF mem)));
3383   format %{ "pshufd  $dst,$mem,0x00\n\t"
3384             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3385   ins_encode %{
3386     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3387     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3388   %}
3389   ins_pipe( pipe_slow );
3390 %}
3391 
3392 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3393   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3394   match(Set dst (ReplicateF zero));
3395   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3396   ins_encode %{
3397     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3398   %}
3399   ins_pipe( fpu_reg_reg );
3400 %}
3401 
3402 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3403   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3404   match(Set dst (ReplicateF zero));
3405   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3406   ins_encode %{
3407     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3408   %}
3409   ins_pipe( fpu_reg_reg );
3410 %}
3411 
3412 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3413   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3414   match(Set dst (ReplicateF zero));
3415   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3416   ins_encode %{
3417     int vector_len = 1;
3418     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3419   %}
3420   ins_pipe( fpu_reg_reg );
3421 %}
3422 
3423 instruct Repl2D_mem(vecX dst, memory mem) %{
3424   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3425   match(Set dst (ReplicateD (LoadD mem)));
3426   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3427   ins_encode %{
3428     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3429   %}
3430   ins_pipe( pipe_slow );
3431 %}
3432 
3433 instruct Repl4D(vecY dst, regD src) %{
3434   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3435   match(Set dst (ReplicateD src));
3436   format %{ "pshufd  $dst,$src,0x44\n\t"
3437             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3438   ins_encode %{
3439     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3440     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3441   %}
3442   ins_pipe( pipe_slow );
3443 %}
3444 
3445 instruct Repl4D_mem(vecY dst, memory mem) %{
3446   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3447   match(Set dst (ReplicateD (LoadD mem)));
3448   format %{ "pshufd  $dst,$mem,0x44\n\t"
3449             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3450   ins_encode %{
3451     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3452     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3453   %}
3454   ins_pipe( pipe_slow );
3455 %}
3456 
3457 // Replicate double (8 byte) scalar zero to be vector
3458 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3459   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3460   match(Set dst (ReplicateD zero));
3461   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3462   ins_encode %{
3463     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3464   %}
3465   ins_pipe( fpu_reg_reg );
3466 %}
3467 
3468 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3469   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3470   match(Set dst (ReplicateD zero));
3471   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3472   ins_encode %{
3473     int vector_len = 1;
3474     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3475   %}
3476   ins_pipe( fpu_reg_reg );
3477 %}
3478 
3479 // ====================GENERIC REPLICATE==========================================
3480 
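// The patterns in this section are guarded only by the vector length, not by
// AVX-512 feature predicates.  Zero replication simply clears the register
// with pxor (128 bits and below) or vpxor (256 bits).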
3481 // Replicate byte scalar to be vector
3482 instruct Repl4B(vecS dst, rRegI src) %{
3483   predicate(n->as_Vector()->length() == 4);
3484   match(Set dst (ReplicateB src));
3485   format %{ "movd    $dst,$src\n\t"
3486             "punpcklbw $dst,$dst\n\t"
3487             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3488   ins_encode %{
3489     __ movdl($dst$$XMMRegister, $src$$Register);
3490     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3491     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3492   %}
3493   ins_pipe( pipe_slow );
3494 %}
3495 
3496 instruct Repl8B(vecD dst, rRegI src) %{
3497   predicate(n->as_Vector()->length() == 8);
3498   match(Set dst (ReplicateB src));
3499   format %{ "movd    $dst,$src\n\t"
3500             "punpcklbw $dst,$dst\n\t"
3501             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3502   ins_encode %{
3503     __ movdl($dst$$XMMRegister, $src$$Register);
3504     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3505     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
3510 // Replicate byte scalar immediate to be vector by loading from const table.
3511 instruct Repl4B_imm(vecS dst, immI con) %{
3512   predicate(n->as_Vector()->length() == 4);
3513   match(Set dst (ReplicateB con));
3514   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3515   ins_encode %{
3516     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3517   %}
3518   ins_pipe( pipe_slow );
3519 %}
3520 
3521 instruct Repl8B_imm(vecD dst, immI con) %{
3522   predicate(n->as_Vector()->length() == 8);
3523   match(Set dst (ReplicateB con));
3524   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3525   ins_encode %{
3526     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3527   %}
3528   ins_pipe( pipe_slow );
3529 %}
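// The replicate4_imm/replicate8_imm helpers used with $constantaddress are
// understood to build a 32-bit or 64-bit constant by repeating the immediate
// at the given element width; the result is then loaded from the constant
// table.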
3530 
3531 // Replicate byte scalar zero to be vector
3532 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3533   predicate(n->as_Vector()->length() == 4);
3534   match(Set dst (ReplicateB zero));
3535   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3536   ins_encode %{
3537     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3538   %}
3539   ins_pipe( fpu_reg_reg );
3540 %}
3541 
3542 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3543   predicate(n->as_Vector()->length() == 8);
3544   match(Set dst (ReplicateB zero));
3545   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3546   ins_encode %{
3547     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3548   %}
3549   ins_pipe( fpu_reg_reg );
3550 %}
3551 
3552 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3553   predicate(n->as_Vector()->length() == 16);
3554   match(Set dst (ReplicateB zero));
3555   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3556   ins_encode %{
3557     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3558   %}
3559   ins_pipe( fpu_reg_reg );
3560 %}
3561 
3562 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3563   predicate(n->as_Vector()->length() == 32);
3564   match(Set dst (ReplicateB zero));
3565   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3566   ins_encode %{
3567     // Plain AVX has no 256-bit vpxor (AVX2 adds it); the vpxor call below falls back to vxorpd in that case.
3568     int vector_len = 1;
3569     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3570   %}
3571   ins_pipe( fpu_reg_reg );
3572 %}
3573 
3574 // Replicate char/short (2 byte) scalar to be vector
3575 instruct Repl2S(vecS dst, rRegI src) %{
3576   predicate(n->as_Vector()->length() == 2);
3577   match(Set dst (ReplicateS src));
3578   format %{ "movd    $dst,$src\n\t"
3579             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3580   ins_encode %{
3581     __ movdl($dst$$XMMRegister, $src$$Register);
3582     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3583   %}
3584   ins_pipe( fpu_reg_reg );
3585 %}
3586 
3587 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3588 instruct Repl2S_imm(vecS dst, immI con) %{
3589   predicate(n->as_Vector()->length() == 2);
3590   match(Set dst (ReplicateS con));
3591   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3592   ins_encode %{
3593     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3594   %}
3595   ins_pipe( fpu_reg_reg );
3596 %}
3597 
3598 instruct Repl4S_imm(vecD dst, immI con) %{
3599   predicate(n->as_Vector()->length() == 4);
3600   match(Set dst (ReplicateS con));
3601   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3602   ins_encode %{
3603     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3604   %}
3605   ins_pipe( fpu_reg_reg );
3606 %}
3607 
3608 // Replicate char/short (2 byte) scalar zero to be vector
3609 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3610   predicate(n->as_Vector()->length() == 2);
3611   match(Set dst (ReplicateS zero));
3612   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3613   ins_encode %{
3614     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3615   %}
3616   ins_pipe( fpu_reg_reg );
3617 %}
3618 
3619 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3620   predicate(n->as_Vector()->length() == 4);
3621   match(Set dst (ReplicateS zero));
3622   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3623   ins_encode %{
3624     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3625   %}
3626   ins_pipe( fpu_reg_reg );
3627 %}
3628 
3629 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3630   predicate(n->as_Vector()->length() == 8);
3631   match(Set dst (ReplicateS zero));
3632   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3633   ins_encode %{
3634     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3635   %}
3636   ins_pipe( fpu_reg_reg );
3637 %}
3638 
3639 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3640   predicate(n->as_Vector()->length() == 16);
3641   match(Set dst (ReplicateS zero));
3642   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3643   ins_encode %{
3644     // Plain AVX has no 256-bit vpxor (AVX2 adds it); the vpxor call below falls back to vxorpd in that case.
3645     int vector_len = 1;
3646     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3647   %}
3648   ins_pipe( fpu_reg_reg );
3649 %}
3650 
3651 // Replicate integer (4 byte) scalar to be vector
3652 instruct Repl2I(vecD dst, rRegI src) %{
3653   predicate(n->as_Vector()->length() == 2);
3654   match(Set dst (ReplicateI src));
3655   format %{ "movd    $dst,$src\n\t"
3656             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3657   ins_encode %{
3658     __ movdl($dst$$XMMRegister, $src$$Register);
3659     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3660   %}
3661   ins_pipe( fpu_reg_reg );
3662 %}
3663 
3664 // An integer can be loaded into an XMM register directly from memory.
3665 instruct Repl2I_mem(vecD dst, memory mem) %{
3666   predicate(n->as_Vector()->length() == 2);
3667   match(Set dst (ReplicateI (LoadI mem)));
3668   format %{ "movd    $dst,$mem\n\t"
3669             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3670   ins_encode %{
3671     __ movdl($dst$$XMMRegister, $mem$$Address);
3672     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3673   %}
3674   ins_pipe( fpu_reg_reg );
3675 %}
3676 
3677 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3678 instruct Repl2I_imm(vecD dst, immI con) %{
3679   predicate(n->as_Vector()->length() == 2);
3680   match(Set dst (ReplicateI con));
3681   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3682   ins_encode %{
3683     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3684   %}
3685   ins_pipe( fpu_reg_reg );
3686 %}
3687 
3688 // Replicate integer (4 byte) scalar zero to be vector
3689 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3690   predicate(n->as_Vector()->length() == 2);
3691   match(Set dst (ReplicateI zero));
3692   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3693   ins_encode %{
3694     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3695   %}
3696   ins_pipe( fpu_reg_reg );
3697 %}
3698 
3699 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3700   predicate(n->as_Vector()->length() == 4);
3701   match(Set dst (ReplicateI zero));
3702   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
3703   ins_encode %{
3704     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3705   %}
3706   ins_pipe( fpu_reg_reg );
3707 %}
3708 
3709 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3710   predicate(n->as_Vector()->length() == 8);
3711   match(Set dst (ReplicateI zero));
3712   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3713   ins_encode %{
3714     // Plain AVX has no 256-bit vpxor (AVX2 adds it); the vpxor call below falls back to vxorpd in that case.
3715     int vector_len = 1;
3716     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3717   %}
3718   ins_pipe( fpu_reg_reg );
3719 %}
3720 
3721 // Replicate long (8 byte) scalar to be vector
3722 #ifdef _LP64
3723 instruct Repl2L(vecX dst, rRegL src) %{
3724   predicate(n->as_Vector()->length() == 2);
3725   match(Set dst (ReplicateL src));
3726   format %{ "movdq   $dst,$src\n\t"
3727             "punpcklqdq $dst,$dst\t! replicate2L" %}
3728   ins_encode %{
3729     __ movdq($dst$$XMMRegister, $src$$Register);
3730     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3731   %}
3732   ins_pipe( pipe_slow );
3733 %}
3734 #else // _LP64
3735 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3736   predicate(n->as_Vector()->length() == 2);
3737   match(Set dst (ReplicateL src));
3738   effect(TEMP dst, USE src, TEMP tmp);
3739   format %{ "movdl   $dst,$src.lo\n\t"
3740             "movdl   $tmp,$src.hi\n\t"
3741             "punpckldq $dst,$tmp\n\t"
3742             "punpcklqdq $dst,$dst\t! replicate2L"%}
3743   ins_encode %{
3744     __ movdl($dst$$XMMRegister, $src$$Register);
3745     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3746     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3747     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3748   %}
3749   ins_pipe( pipe_slow );
3750 %}
3751 #endif // _LP64
3752 
3753 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3754 instruct Repl2L_imm(vecX dst, immL con) %{
3755   predicate(n->as_Vector()->length() == 2);
3756   match(Set dst (ReplicateL con));
3757   format %{ "movq    $dst,[$constantaddress]\n\t"
3758             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3759   ins_encode %{
3760     __ movq($dst$$XMMRegister, $constantaddress($con));
3761     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3762   %}
3763   ins_pipe( pipe_slow );
3764 %}
3765 
3766 // Replicate long (8 byte) scalar zero to be vector
3767 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3768   predicate(n->as_Vector()->length() == 2);
3769   match(Set dst (ReplicateL zero));
3770   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3771   ins_encode %{
3772     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3773   %}
3774   ins_pipe( fpu_reg_reg );
3775 %}
3776 
3777 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3778   predicate(n->as_Vector()->length() == 4);
3779   match(Set dst (ReplicateL zero));
3780   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3781   ins_encode %{
3782     // Plain AVX has no 256-bit vpxor (AVX2 adds it); the vpxor call below falls back to vxorpd in that case.
3783     int vector_len = 1;
3784     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3785   %}
3786   ins_pipe( fpu_reg_reg );
3787 %}
3788 
3789 // Replicate float (4 byte) scalar to be vector
3790 instruct Repl2F(vecD dst, regF src) %{
3791   predicate(n->as_Vector()->length() == 2);
3792   match(Set dst (ReplicateF src));
3793   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
3794   ins_encode %{
3795     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3796   %}
3797   ins_pipe( fpu_reg_reg );
3798 %}
3799 
3800 instruct Repl4F(vecX dst, regF src) %{
3801   predicate(n->as_Vector()->length() == 4);
3802   match(Set dst (ReplicateF src));
3803   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
3804   ins_encode %{
3805     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3806   %}
3807   ins_pipe( pipe_slow );
3808 %}
3809 
// Replicate double (8 byte) scalar to be vector
3811 instruct Repl2D(vecX dst, regD src) %{
3812   predicate(n->as_Vector()->length() == 2);
3813   match(Set dst (ReplicateD src));
3814   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3815   ins_encode %{
3816     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3817   %}
3818   ins_pipe( pipe_slow );
3819 %}
3820 
3821 // ====================EVEX REPLICATE=============================================
3822 
3823 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
3824   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3825   match(Set dst (ReplicateB (LoadB mem)));
3826   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
3827   ins_encode %{
3828     int vector_len = 0;
3829     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3830   %}
3831   ins_pipe( pipe_slow );
3832 %}
3833 
3834 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
3835   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3836   match(Set dst (ReplicateB (LoadB mem)));
3837   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
3838   ins_encode %{
3839     int vector_len = 0;
3840     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3841   %}
3842   ins_pipe( pipe_slow );
3843 %}
3844 
3845 instruct Repl16B_evex(vecX dst, rRegI src) %{
3846   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3847   match(Set dst (ReplicateB src));
3848   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
3849   ins_encode %{
    int vector_len = 0;
3851     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3852   %}
3853   ins_pipe( pipe_slow );
3854 %}
3855 
3856 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
3857   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3858   match(Set dst (ReplicateB (LoadB mem)));
3859   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
3860   ins_encode %{
3861     int vector_len = 0;
3862     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3863   %}
3864   ins_pipe( pipe_slow );
3865 %}
3866 
3867 instruct Repl32B_evex(vecY dst, rRegI src) %{
3868   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3869   match(Set dst (ReplicateB src));
3870   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
3871   ins_encode %{
    int vector_len = 1;
3873     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3874   %}
3875   ins_pipe( pipe_slow );
3876 %}
3877 
3878 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
3879   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3880   match(Set dst (ReplicateB (LoadB mem)));
3881   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
3882   ins_encode %{
3883     int vector_len = 1;
3884     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3885   %}
3886   ins_pipe( pipe_slow );
3887 %}
3888 
3889 instruct Repl64B_evex(vecZ dst, rRegI src) %{
3890   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3891   match(Set dst (ReplicateB src));
3892   format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
3893   ins_encode %{
    int vector_len = 2;
3895     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3896   %}
3897   ins_pipe( pipe_slow );
3898 %}
3899 
3900 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
3901   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3902   match(Set dst (ReplicateB (LoadB mem)));
3903   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
3904   ins_encode %{
3905     int vector_len = 2;
3906     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3907   %}
3908   ins_pipe( pipe_slow );
3909 %}
3910 
3911 instruct Repl16B_imm_evex(vecX dst, immI con) %{
3912   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3913   match(Set dst (ReplicateB con));
3914   format %{ "movq    $dst,[$constantaddress]\n\t"
3915             "vpbroadcastb $dst,$dst\t! replicate16B" %}
3916   ins_encode %{
    int vector_len = 0;
3918     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3919     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3920   %}
3921   ins_pipe( pipe_slow );
3922 %}
3923 
3924 instruct Repl32B_imm_evex(vecY dst, immI con) %{
3925   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3926   match(Set dst (ReplicateB con));
3927   format %{ "movq    $dst,[$constantaddress]\n\t"
3928             "vpbroadcastb $dst,$dst\t! replicate32B" %}
3929   ins_encode %{
    int vector_len = 1;
3931     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3932     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3933   %}
3934   ins_pipe( pipe_slow );
3935 %}
3936 
3937 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
3938   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3939   match(Set dst (ReplicateB con));
3940   format %{ "movq    $dst,[$constantaddress]\n\t"
3941             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
3942   ins_encode %{
    int vector_len = 2;
3944     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3945     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3946   %}
3947   ins_pipe( pipe_slow );
3948 %}
3949 
3950 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
3951   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3952   match(Set dst (ReplicateB zero));
3953   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3954   ins_encode %{
    // 512-bit vpxor is an EVEX (AVX-512) encoding; the UseAVX > 2 predicate guarantees it is available here.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3958   %}
3959   ins_pipe( fpu_reg_reg );
3960 %}
3961 
3962 instruct Repl4S_evex(vecD dst, rRegI src) %{
3963   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3964   match(Set dst (ReplicateS src));
3965   format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
3966   ins_encode %{
    int vector_len = 0;
3968     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3969   %}
3970   ins_pipe( pipe_slow );
3971 %}
3972 
3973 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
3974   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3975   match(Set dst (ReplicateS (LoadS mem)));
3976   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
3977   ins_encode %{
3978     int vector_len = 0;
3979     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3980   %}
3981   ins_pipe( pipe_slow );
3982 %}
3983 
3984 instruct Repl8S_evex(vecX dst, rRegI src) %{
3985   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3986   match(Set dst (ReplicateS src));
3987   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
3988   ins_encode %{
    int vector_len = 0;
3990     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3991   %}
3992   ins_pipe( pipe_slow );
3993 %}
3994 
3995 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
3996   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3997   match(Set dst (ReplicateS (LoadS mem)));
3998   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
3999   ins_encode %{
4000     int vector_len = 0;
4001     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4002   %}
4003   ins_pipe( pipe_slow );
4004 %}
4005 
4006 instruct Repl16S_evex(vecY dst, rRegI src) %{
4007   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4008   match(Set dst (ReplicateS src));
4009   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
4010   ins_encode %{
    int vector_len = 1;
4012     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4013   %}
4014   ins_pipe( pipe_slow );
4015 %}
4016 
4017 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4018   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4019   match(Set dst (ReplicateS (LoadS mem)));
4020   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4021   ins_encode %{
4022     int vector_len = 1;
4023     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4024   %}
4025   ins_pipe( pipe_slow );
4026 %}
4027 
4028 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4029   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4030   match(Set dst (ReplicateS src));
4031   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
4032   ins_encode %{
    int vector_len = 2;
4034     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4035   %}
4036   ins_pipe( pipe_slow );
4037 %}
4038 
4039 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4040   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4041   match(Set dst (ReplicateS (LoadS mem)));
4042   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4043   ins_encode %{
4044     int vector_len = 2;
4045     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4046   %}
4047   ins_pipe( pipe_slow );
4048 %}
4049 
4050 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4051   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4052   match(Set dst (ReplicateS con));
4053   format %{ "movq    $dst,[$constantaddress]\n\t"
4054             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4055   ins_encode %{
    int vector_len = 0;
4057     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4058     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4059   %}
4060   ins_pipe( pipe_slow );
4061 %}
4062 
4063 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4064   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4065   match(Set dst (ReplicateS con));
4066   format %{ "movq    $dst,[$constantaddress]\n\t"
4067             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4068   ins_encode %{
    int vector_len = 1;
4070     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4071     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4072   %}
4073   ins_pipe( pipe_slow );
4074 %}
4075 
4076 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4077   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4078   match(Set dst (ReplicateS con));
4079   format %{ "movq    $dst,[$constantaddress]\n\t"
4080             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4081   ins_encode %{
    int vector_len = 2;
4083     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4084     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4085   %}
4086   ins_pipe( pipe_slow );
4087 %}
4088 
4089 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4090   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4091   match(Set dst (ReplicateS zero));
4092   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4093   ins_encode %{
    // 512-bit vpxor is an EVEX (AVX-512) encoding; the UseAVX > 2 predicate guarantees it is available here.
4095     int vector_len = 2;
4096     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4097   %}
4098   ins_pipe( fpu_reg_reg );
4099 %}
4100 
4101 instruct Repl4I_evex(vecX dst, rRegI src) %{
4102   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4103   match(Set dst (ReplicateI src));
4104   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
4105   ins_encode %{
4106     int vector_len = 0;
4107     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4108   %}
4109   ins_pipe( pipe_slow );
4110 %}
4111 
4112 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4113   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4114   match(Set dst (ReplicateI (LoadI mem)));
4115   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4116   ins_encode %{
4117     int vector_len = 0;
4118     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4119   %}
4120   ins_pipe( pipe_slow );
4121 %}
4122 
4123 instruct Repl8I_evex(vecY dst, rRegI src) %{
4124   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4125   match(Set dst (ReplicateI src));
4126   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
4127   ins_encode %{
4128     int vector_len = 1;
4129     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4130   %}
4131   ins_pipe( pipe_slow );
4132 %}
4133 
4134 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4135   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4136   match(Set dst (ReplicateI (LoadI mem)));
4137   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4138   ins_encode %{
4139     int vector_len = 1;
4140     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4141   %}
4142   ins_pipe( pipe_slow );
4143 %}
4144 
4145 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4146   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4147   match(Set dst (ReplicateI src));
4148   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
4149   ins_encode %{
4150     int vector_len = 2;
4151     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4152   %}
4153   ins_pipe( pipe_slow );
4154 %}
4155 
4156 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4157   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4158   match(Set dst (ReplicateI (LoadI mem)));
4159   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4160   ins_encode %{
4161     int vector_len = 2;
4162     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4163   %}
4164   ins_pipe( pipe_slow );
4165 %}
4166 
4167 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4168   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4169   match(Set dst (ReplicateI con));
4170   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4171             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4172   ins_encode %{
4173     int vector_len = 0;
4174     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4175     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4176   %}
4177   ins_pipe( pipe_slow );
4178 %}
4179 
4180 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4181   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4182   match(Set dst (ReplicateI con));
4183   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4184             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4185   ins_encode %{
4186     int vector_len = 1;
4187     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4188     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4189   %}
4190   ins_pipe( pipe_slow );
4191 %}
4192 
4193 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4194   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4195   match(Set dst (ReplicateI con));
4196   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4197             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4198   ins_encode %{
4199     int vector_len = 2;
4200     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4201     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4202   %}
4203   ins_pipe( pipe_slow );
4204 %}
4205 
4206 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4207   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4208   match(Set dst (ReplicateI zero));
4209   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4210   ins_encode %{
    // 512-bit vpxor is an EVEX (AVX-512) encoding; the UseAVX > 2 predicate guarantees it is available here.
4212     int vector_len = 2;
4213     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4214   %}
4215   ins_pipe( fpu_reg_reg );
4216 %}
4217 
4218 // Replicate long (8 byte) scalar to be vector
4219 #ifdef _LP64
4220 instruct Repl4L_evex(vecY dst, rRegL src) %{
4221   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4222   match(Set dst (ReplicateL src));
4223   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4224   ins_encode %{
4225     int vector_len = 1;
4226     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4227   %}
4228   ins_pipe( pipe_slow );
4229 %}
4230 
4231 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4232   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4233   match(Set dst (ReplicateL src));
4234   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4235   ins_encode %{
4236     int vector_len = 2;
4237     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4238   %}
4239   ins_pipe( pipe_slow );
4240 %}
4241 #else // _LP64
4242 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4243   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4244   match(Set dst (ReplicateL src));
4245   effect(TEMP dst, USE src, TEMP tmp);
4246   format %{ "movdl   $dst,$src.lo\n\t"
4247             "movdl   $tmp,$src.hi\n\t"
4248             "punpckldq $dst,$tmp\n\t"
4249             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4250   ins_encode %{
4251     int vector_len = 1;
4252     __ movdl($dst$$XMMRegister, $src$$Register);
4253     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4254     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4255     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4256   %}
4257   ins_pipe( pipe_slow );
4258 %}
4259 
4260 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4261   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4262   match(Set dst (ReplicateL src));
4263   effect(TEMP dst, USE src, TEMP tmp);
4264   format %{ "movdl   $dst,$src.lo\n\t"
4265             "movdl   $tmp,$src.hi\n\t"
4266             "punpckldq $dst,$tmp\n\t"
4267             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4268   ins_encode %{
4269     int vector_len = 2;
4270     __ movdl($dst$$XMMRegister, $src$$Register);
4271     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4272     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4273     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4274   %}
4275   ins_pipe( pipe_slow );
4276 %}
4277 #endif // _LP64
4278 
4279 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4280   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4281   match(Set dst (ReplicateL con));
4282   format %{ "movq    $dst,[$constantaddress]\n\t"
4283             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4284   ins_encode %{
4285     int vector_len = 1;
4286     __ movq($dst$$XMMRegister, $constantaddress($con));
4287     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4288   %}
4289   ins_pipe( pipe_slow );
4290 %}
4291 
4292 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4293   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4294   match(Set dst (ReplicateL con));
4295   format %{ "movq    $dst,[$constantaddress]\n\t"
4296             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4297   ins_encode %{
4298     int vector_len = 2;
4299     __ movq($dst$$XMMRegister, $constantaddress($con));
4300     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4301   %}
4302   ins_pipe( pipe_slow );
4303 %}
4304 
4305 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4306   predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
4307   match(Set dst (ReplicateL (LoadL mem)));
4308   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4309   ins_encode %{
4310     int vector_len = 0;
4311     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4312   %}
4313   ins_pipe( pipe_slow );
4314 %}
4315 
4316 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4317   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4318   match(Set dst (ReplicateL (LoadL mem)));
4319   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4320   ins_encode %{
4321     int vector_len = 1;
4322     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4323   %}
4324   ins_pipe( pipe_slow );
4325 %}
4326 
4327 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4328   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4329   match(Set dst (ReplicateL (LoadL mem)));
4330   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4331   ins_encode %{
4332     int vector_len = 2;
4333     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4334   %}
4335   ins_pipe( pipe_slow );
4336 %}
4337 
4338 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4339   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4340   match(Set dst (ReplicateL zero));
4341   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4342   ins_encode %{
    // 512-bit vpxor is an EVEX (AVX-512) encoding; the UseAVX > 2 predicate guarantees it is available here.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4346   %}
4347   ins_pipe( fpu_reg_reg );
4348 %}
4349 
4350 instruct Repl8F_evex(vecY dst, regF src) %{
4351   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4352   match(Set dst (ReplicateF src));
4353   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4354   ins_encode %{
4355     int vector_len = 1;
4356     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4357   %}
4358   ins_pipe( pipe_slow );
4359 %}
4360 
4361 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4362   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4363   match(Set dst (ReplicateF (LoadF mem)));
4364   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4365   ins_encode %{
4366     int vector_len = 1;
4367     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4368   %}
4369   ins_pipe( pipe_slow );
4370 %}
4371 
4372 instruct Repl16F_evex(vecZ dst, regF src) %{
4373   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4374   match(Set dst (ReplicateF src));
4375   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4376   ins_encode %{
4377     int vector_len = 2;
4378     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4379   %}
4380   ins_pipe( pipe_slow );
4381 %}
4382 
4383 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4384   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4385   match(Set dst (ReplicateF (LoadF mem)));
4386   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4387   ins_encode %{
4388     int vector_len = 2;
4389     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4390   %}
4391   ins_pipe( pipe_slow );
4392 %}
4393 
4394 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4395   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4396   match(Set dst (ReplicateF zero));
4397   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4398   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4402   %}
4403   ins_pipe( fpu_reg_reg );
4404 %}
4405 
4406 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4407   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4408   match(Set dst (ReplicateF zero));
4409   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4410   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4414   %}
4415   ins_pipe( fpu_reg_reg );
4416 %}
4417 
4418 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4419   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4420   match(Set dst (ReplicateF zero));
4421   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4422   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4426   %}
4427   ins_pipe( fpu_reg_reg );
4428 %}
4429 
4430 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4431   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4432   match(Set dst (ReplicateF zero));
4433   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4434   ins_encode %{
    // Use vpxor in place of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4438   %}
4439   ins_pipe( fpu_reg_reg );
4440 %}
4441 
4442 instruct Repl4D_evex(vecY dst, regD src) %{
4443   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4444   match(Set dst (ReplicateD src));
4445   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4446   ins_encode %{
4447     int vector_len = 1;
4448     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4449   %}
4450   ins_pipe( pipe_slow );
4451 %}
4452 
4453 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4454   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4455   match(Set dst (ReplicateD (LoadD mem)));
4456   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4457   ins_encode %{
4458     int vector_len = 1;
4459     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4460   %}
4461   ins_pipe( pipe_slow );
4462 %}
4463 
4464 instruct Repl8D_evex(vecZ dst, regD src) %{
4465   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4466   match(Set dst (ReplicateD src));
4467   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4468   ins_encode %{
4469     int vector_len = 2;
4470     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4471   %}
4472   ins_pipe( pipe_slow );
4473 %}
4474 
4475 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4476   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4477   match(Set dst (ReplicateD (LoadD mem)));
4478   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4479   ins_encode %{
4480     int vector_len = 2;
4481     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4482   %}
4483   ins_pipe( pipe_slow );
4484 %}
4485 
4486 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4487   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4488   match(Set dst (ReplicateD zero));
4489   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4490   ins_encode %{
    // Use vpxor in place of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4494   %}
4495   ins_pipe( fpu_reg_reg );
4496 %}
4497 
4498 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4499   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4500   match(Set dst (ReplicateD zero));
4501   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4502   ins_encode %{
    // Use vpxor in place of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4506   %}
4507   ins_pipe( fpu_reg_reg );
4508 %}
4509 
4510 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4511   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4512   match(Set dst (ReplicateD zero));
4513   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4514   ins_encode %{
    // Use vpxor in place of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4518   %}
4519   ins_pipe( fpu_reg_reg );
4520 %}
4521 
4522 // ====================REDUCTION ARITHMETIC=======================================
4523 
4524 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4525   predicate(UseSSE > 2 && UseAVX == 0);
4526   match(Set dst (AddReductionVI src1 src2));
4527   effect(TEMP tmp2, TEMP tmp);
4528   format %{ "movdqu  $tmp2,$src2\n\t"
4529             "phaddd  $tmp2,$tmp2\n\t"
4530             "movd    $tmp,$src1\n\t"
4531             "paddd   $tmp,$tmp2\n\t"
4532             "movd    $dst,$tmp\t! add reduction2I" %}
4533   ins_encode %{
4534     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4535     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4536     __ movdl($tmp$$XMMRegister, $src1$$Register);
4537     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4538     __ movdl($dst$$Register, $tmp$$XMMRegister);
4539   %}
4540   ins_pipe( pipe_slow );
4541 %}
4542 
4543 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4544   predicate(VM_Version::supports_avxonly());
4545   match(Set dst (AddReductionVI src1 src2));
4546   effect(TEMP tmp, TEMP tmp2);
4547   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4548             "movd     $tmp2,$src1\n\t"
4549             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4550             "movd     $dst,$tmp2\t! add reduction2I" %}
4551   ins_encode %{
4552     int vector_len = 0;
4553     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4554     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4555     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4556     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4557   %}
4558   ins_pipe( pipe_slow );
4559 %}
4560 
4561 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4562   predicate(UseAVX > 2);
4563   match(Set dst (AddReductionVI src1 src2));
4564   effect(TEMP tmp, TEMP tmp2);
4565   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4566             "vpaddd  $tmp,$src2,$tmp2\n\t"
4567             "movd    $tmp2,$src1\n\t"
4568             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4569             "movd    $dst,$tmp2\t! add reduction2I" %}
4570   ins_encode %{
4571     int vector_len = 0;
4572     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4573     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4574     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4575     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4576     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4577   %}
4578   ins_pipe( pipe_slow );
4579 %}
4580 
4581 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4582   predicate(UseSSE > 2 && UseAVX == 0);
4583   match(Set dst (AddReductionVI src1 src2));
4584   effect(TEMP tmp, TEMP tmp2);
4585   format %{ "movdqu  $tmp,$src2\n\t"
4586             "phaddd  $tmp,$tmp\n\t"
4587             "phaddd  $tmp,$tmp\n\t"
4588             "movd    $tmp2,$src1\n\t"
4589             "paddd   $tmp2,$tmp\n\t"
4590             "movd    $dst,$tmp2\t! add reduction4I" %}
4591   ins_encode %{
4592     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4593     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4594     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4595     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4596     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4597     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4598   %}
4599   ins_pipe( pipe_slow );
4600 %}
4601 
4602 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4603   predicate(VM_Version::supports_avxonly());
4604   match(Set dst (AddReductionVI src1 src2));
4605   effect(TEMP tmp, TEMP tmp2);
4606   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4607             "vphaddd  $tmp,$tmp,$tmp\n\t"
4608             "movd     $tmp2,$src1\n\t"
4609             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4610             "movd     $dst,$tmp2\t! add reduction4I" %}
4611   ins_encode %{
4612     int vector_len = 0;
4613     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4614     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4615     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4616     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4617     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4618   %}
4619   ins_pipe( pipe_slow );
4620 %}
4621 
4622 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4623   predicate(UseAVX > 2);
4624   match(Set dst (AddReductionVI src1 src2));
4625   effect(TEMP tmp, TEMP tmp2);
4626   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4627             "vpaddd  $tmp,$src2,$tmp2\n\t"
4628             "pshufd  $tmp2,$tmp,0x1\n\t"
4629             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4630             "movd    $tmp2,$src1\n\t"
4631             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4632             "movd    $dst,$tmp2\t! add reduction4I" %}
4633   ins_encode %{
4634     int vector_len = 0;
4635     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4636     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4637     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4638     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4639     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4640     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4641     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4642   %}
4643   ins_pipe( pipe_slow );
4644 %}
4645 
4646 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4647   predicate(VM_Version::supports_avxonly());
4648   match(Set dst (AddReductionVI src1 src2));
4649   effect(TEMP tmp, TEMP tmp2);
4650   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4651             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4652             "vextracti128_high  $tmp2,$tmp\n\t"
4653             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4654             "movd     $tmp2,$src1\n\t"
4655             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4656             "movd     $dst,$tmp2\t! add reduction8I" %}
4657   ins_encode %{
4658     int vector_len = 1;
4659     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4660     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4661     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4662     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4663     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4664     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4665     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4666   %}
4667   ins_pipe( pipe_slow );
4668 %}
4669 
4670 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4671   predicate(UseAVX > 2);
4672   match(Set dst (AddReductionVI src1 src2));
4673   effect(TEMP tmp, TEMP tmp2);
4674   format %{ "vextracti128_high  $tmp,$src2\n\t"
4675             "vpaddd  $tmp,$tmp,$src2\n\t"
4676             "pshufd  $tmp2,$tmp,0xE\n\t"
4677             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4678             "pshufd  $tmp2,$tmp,0x1\n\t"
4679             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4680             "movd    $tmp2,$src1\n\t"
4681             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4682             "movd    $dst,$tmp2\t! add reduction8I" %}
4683   ins_encode %{
4684     int vector_len = 0;
4685     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4686     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4687     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4688     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4689     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4690     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4691     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4692     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4693     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4694   %}
4695   ins_pipe( pipe_slow );
4696 %}
4697 
4698 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4699   predicate(UseAVX > 2);
4700   match(Set dst (AddReductionVI src1 src2));
4701   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4702   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4703             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4704             "vextracti128_high  $tmp,$tmp3\n\t"
4705             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4706             "pshufd  $tmp2,$tmp,0xE\n\t"
4707             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4708             "pshufd  $tmp2,$tmp,0x1\n\t"
4709             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4710             "movd    $tmp2,$src1\n\t"
4711             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4712             "movd    $dst,$tmp2\t! mul reduction16I" %}
4713   ins_encode %{
4714     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4715     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4716     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4717     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4718     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4719     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4720     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4721     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4722     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4723     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4724     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4725   %}
4726   ins_pipe( pipe_slow );
4727 %}
4728 
4729 #ifdef _LP64
4730 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4731   predicate(UseAVX > 2);
4732   match(Set dst (AddReductionVL src1 src2));
4733   effect(TEMP tmp, TEMP tmp2);
4734   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4735             "vpaddq  $tmp,$src2,$tmp2\n\t"
4736             "movdq   $tmp2,$src1\n\t"
4737             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4738             "movdq   $dst,$tmp2\t! add reduction2L" %}
4739   ins_encode %{
4740     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4741     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4742     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4743     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4744     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4745   %}
4746   ins_pipe( pipe_slow );
4747 %}
4748 
4749 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4750   predicate(UseAVX > 2);
4751   match(Set dst (AddReductionVL src1 src2));
4752   effect(TEMP tmp, TEMP tmp2);
4753   format %{ "vextracti128_high  $tmp,$src2\n\t"
4754             "vpaddq  $tmp2,$tmp,$src2\n\t"
4755             "pshufd  $tmp,$tmp2,0xE\n\t"
4756             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4757             "movdq   $tmp,$src1\n\t"
4758             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4759             "movdq   $dst,$tmp2\t! add reduction4L" %}
4760   ins_encode %{
4761     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4762     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4763     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4764     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4765     __ movdq($tmp$$XMMRegister, $src1$$Register);
4766     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4767     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4768   %}
4769   ins_pipe( pipe_slow );
4770 %}
4771 
4772 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4773   predicate(UseAVX > 2);
4774   match(Set dst (AddReductionVL src1 src2));
4775   effect(TEMP tmp, TEMP tmp2);
4776   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
4777             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4778             "vextracti128_high  $tmp,$tmp2\n\t"
4779             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4780             "pshufd  $tmp,$tmp2,0xE\n\t"
4781             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4782             "movdq   $tmp,$src1\n\t"
4783             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4784             "movdq   $dst,$tmp2\t! add reduction8L" %}
4785   ins_encode %{
4786     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4787     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4788     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4789     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4790     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4791     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4792     __ movdq($tmp$$XMMRegister, $src1$$Register);
4793     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4794     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4795   %}
4796   ins_pipe( pipe_slow );
4797 %}
#endif // _LP64
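
// Unlike the integer reductions, the float and double reductions below
// accumulate the lanes strictly in order with scalar adds (addss/vaddss,
// addsd/vaddsd) rather than with horizontal or pairwise vector adds.  The
// likely reason is that FP addition is not associative, so the reduction must
// match the sequential loop it replaces (sketch only):
//
//   float result = dst;                 // dst doubles as the scalar input
//   for (int i = 0; i < N; i++)  result += src2[i];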
4799 
4800 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4801   predicate(UseSSE >= 1 && UseAVX == 0);
4802   match(Set dst (AddReductionVF dst src2));
4803   effect(TEMP dst, TEMP tmp);
4804   format %{ "addss   $dst,$src2\n\t"
4805             "pshufd  $tmp,$src2,0x01\n\t"
4806             "addss   $dst,$tmp\t! add reduction2F" %}
4807   ins_encode %{
4808     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4809     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4810     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4811   %}
4812   ins_pipe( pipe_slow );
4813 %}
4814 
4815 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4816   predicate(UseAVX > 0);
4817   match(Set dst (AddReductionVF dst src2));
4818   effect(TEMP dst, TEMP tmp);
4819   format %{ "vaddss  $dst,$dst,$src2\n\t"
4820             "pshufd  $tmp,$src2,0x01\n\t"
4821             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4822   ins_encode %{
4823     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4824     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4825     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4826   %}
4827   ins_pipe( pipe_slow );
4828 %}
4829 
4830 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4831   predicate(UseSSE >= 1 && UseAVX == 0);
4832   match(Set dst (AddReductionVF dst src2));
4833   effect(TEMP dst, TEMP tmp);
4834   format %{ "addss   $dst,$src2\n\t"
4835             "pshufd  $tmp,$src2,0x01\n\t"
4836             "addss   $dst,$tmp\n\t"
4837             "pshufd  $tmp,$src2,0x02\n\t"
4838             "addss   $dst,$tmp\n\t"
4839             "pshufd  $tmp,$src2,0x03\n\t"
4840             "addss   $dst,$tmp\t! add reduction4F" %}
4841   ins_encode %{
4842     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4843     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4844     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4845     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4846     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4847     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4848     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4849   %}
4850   ins_pipe( pipe_slow );
4851 %}
4852 
4853 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4854   predicate(UseAVX > 0);
4855   match(Set dst (AddReductionVF dst src2));
4856   effect(TEMP tmp, TEMP dst);
4857   format %{ "vaddss  $dst,dst,$src2\n\t"
4858             "pshufd  $tmp,$src2,0x01\n\t"
4859             "vaddss  $dst,$dst,$tmp\n\t"
4860             "pshufd  $tmp,$src2,0x02\n\t"
4861             "vaddss  $dst,$dst,$tmp\n\t"
4862             "pshufd  $tmp,$src2,0x03\n\t"
4863             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
4864   ins_encode %{
4865     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4866     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4867     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4868     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4869     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4870     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4871     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4872   %}
4873   ins_pipe( pipe_slow );
4874 %}
4875 
4876 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
4877   predicate(UseAVX > 0);
4878   match(Set dst (AddReductionVF dst src2));
4879   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4880   format %{ "vaddss  $dst,$dst,$src2\n\t"
4881             "pshufd  $tmp,$src2,0x01\n\t"
4882             "vaddss  $dst,$dst,$tmp\n\t"
4883             "pshufd  $tmp,$src2,0x02\n\t"
4884             "vaddss  $dst,$dst,$tmp\n\t"
4885             "pshufd  $tmp,$src2,0x03\n\t"
4886             "vaddss  $dst,$dst,$tmp\n\t"
4887             "vextractf128_high  $tmp2,$src2\n\t"
4888             "vaddss  $dst,$dst,$tmp2\n\t"
4889             "pshufd  $tmp,$tmp2,0x01\n\t"
4890             "vaddss  $dst,$dst,$tmp\n\t"
4891             "pshufd  $tmp,$tmp2,0x02\n\t"
4892             "vaddss  $dst,$dst,$tmp\n\t"
4893             "pshufd  $tmp,$tmp2,0x03\n\t"
4894             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
4895   ins_encode %{
4896     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4897     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4898     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4899     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4900     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4901     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4902     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4903     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4904     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4905     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4906     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4907     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4908     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4909     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4910     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4911   %}
4912   ins_pipe( pipe_slow );
4913 %}
4914 
4915 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
4916   predicate(UseAVX > 2);
4917   match(Set dst (AddReductionVF dst src2));
4918   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4919   format %{ "vaddss  $dst,$dst,$src2\n\t"
4920             "pshufd  $tmp,$src2,0x01\n\t"
4921             "vaddss  $dst,$dst,$tmp\n\t"
4922             "pshufd  $tmp,$src2,0x02\n\t"
4923             "vaddss  $dst,$dst,$tmp\n\t"
4924             "pshufd  $tmp,$src2,0x03\n\t"
4925             "vaddss  $dst,$dst,$tmp\n\t"
4926             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4927             "vaddss  $dst,$dst,$tmp2\n\t"
4928             "pshufd  $tmp,$tmp2,0x01\n\t"
4929             "vaddss  $dst,$dst,$tmp\n\t"
4930             "pshufd  $tmp,$tmp2,0x02\n\t"
4931             "vaddss  $dst,$dst,$tmp\n\t"
4932             "pshufd  $tmp,$tmp2,0x03\n\t"
4933             "vaddss  $dst,$dst,$tmp\n\t"
4934             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4935             "vaddss  $dst,$dst,$tmp2\n\t"
4936             "pshufd  $tmp,$tmp2,0x01\n\t"
4937             "vaddss  $dst,$dst,$tmp\n\t"
4938             "pshufd  $tmp,$tmp2,0x02\n\t"
4939             "vaddss  $dst,$dst,$tmp\n\t"
4940             "pshufd  $tmp,$tmp2,0x03\n\t"
4941             "vaddss  $dst,$dst,$tmp\n\t"
4942             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4943             "vaddss  $dst,$dst,$tmp2\n\t"
4944             "pshufd  $tmp,$tmp2,0x01\n\t"
4945             "vaddss  $dst,$dst,$tmp\n\t"
4946             "pshufd  $tmp,$tmp2,0x02\n\t"
4947             "vaddss  $dst,$dst,$tmp\n\t"
4948             "pshufd  $tmp,$tmp2,0x03\n\t"
4949             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
4950   ins_encode %{
4951     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4952     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4953     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4954     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4955     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4956     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4957     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4958     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4959     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4960     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4961     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4962     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4963     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4964     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4965     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4966     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4967     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4968     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4969     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4970     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4971     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4972     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4973     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4974     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4975     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4976     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4977     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4978     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4979     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4980     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4981     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4982   %}
4983   ins_pipe( pipe_slow );
4984 %}
4985 
4986 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4987   predicate(UseSSE >= 1 && UseAVX == 0);
4988   match(Set dst (AddReductionVD dst src2));
4989   effect(TEMP tmp, TEMP dst);
4990   format %{ "addsd   $dst,$src2\n\t"
4991             "pshufd  $tmp,$src2,0xE\n\t"
4992             "addsd   $dst,$tmp\t! add reduction2D" %}
4993   ins_encode %{
4994     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
4995     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4996     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
4997   %}
4998   ins_pipe( pipe_slow );
4999 %}
5000 
5001 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5002   predicate(UseAVX > 0);
5003   match(Set dst (AddReductionVD dst src2));
5004   effect(TEMP tmp, TEMP dst);
5005   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5006             "pshufd  $tmp,$src2,0xE\n\t"
5007             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5008   ins_encode %{
5009     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5010     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5011     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5012   %}
5013   ins_pipe( pipe_slow );
5014 %}
5015 
5016 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5017   predicate(UseAVX > 0);
5018   match(Set dst (AddReductionVD dst src2));
5019   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5020   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5021             "pshufd  $tmp,$src2,0xE\n\t"
5022             "vaddsd  $dst,$dst,$tmp\n\t"
5023             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5024             "vaddsd  $dst,$dst,$tmp2\n\t"
5025             "pshufd  $tmp,$tmp2,0xE\n\t"
5026             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5027   ins_encode %{
5028     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5029     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5030     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5031     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5032     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5033     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5034     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5035   %}
5036   ins_pipe( pipe_slow );
5037 %}
5038 
5039 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5040   predicate(UseAVX > 2);
5041   match(Set dst (AddReductionVD dst src2));
5042   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5043   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5044             "pshufd  $tmp,$src2,0xE\n\t"
5045             "vaddsd  $dst,$dst,$tmp\n\t"
5046             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5047             "vaddsd  $dst,$dst,$tmp2\n\t"
5048             "pshufd  $tmp,$tmp2,0xE\n\t"
5049             "vaddsd  $dst,$dst,$tmp\n\t"
5050             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5051             "vaddsd  $dst,$dst,$tmp2\n\t"
5052             "pshufd  $tmp,$tmp2,0xE\n\t"
5053             "vaddsd  $dst,$dst,$tmp\n\t"
5054             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5055             "vaddsd  $dst,$dst,$tmp2\n\t"
5056             "pshufd  $tmp,$tmp2,0xE\n\t"
5057             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5058   ins_encode %{
5059     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5060     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5061     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5062     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5063     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5064     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5065     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5066     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5067     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5068     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5069     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5070     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5071     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5072     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5073     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5074   %}
5075   ins_pipe( pipe_slow );
5076 %}
5077 
5078 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5079   predicate(UseSSE > 3 && UseAVX == 0);
5080   match(Set dst (MulReductionVI src1 src2));
5081   effect(TEMP tmp, TEMP tmp2);
5082   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5083             "pmulld  $tmp2,$src2\n\t"
5084             "movd    $tmp,$src1\n\t"
5085             "pmulld  $tmp2,$tmp\n\t"
5086             "movd    $dst,$tmp2\t! mul reduction2I" %}
5087   ins_encode %{
5088     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5089     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5090     __ movdl($tmp$$XMMRegister, $src1$$Register);
5091     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5092     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5093   %}
5094   ins_pipe( pipe_slow );
5095 %}
5096 
5097 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5098   predicate(UseAVX > 0);
5099   match(Set dst (MulReductionVI src1 src2));
5100   effect(TEMP tmp, TEMP tmp2);
5101   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5102             "vpmulld  $tmp,$src2,$tmp2\n\t"
5103             "movd     $tmp2,$src1\n\t"
5104             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5105             "movd     $dst,$tmp2\t! mul reduction2I" %}
5106   ins_encode %{
5107     int vector_len = 0;
5108     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5109     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5110     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5111     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5112     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5113   %}
5114   ins_pipe( pipe_slow );
5115 %}
5116 
5117 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5118   predicate(UseSSE > 3 && UseAVX == 0);
5119   match(Set dst (MulReductionVI src1 src2));
5120   effect(TEMP tmp, TEMP tmp2);
5121   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5122             "pmulld  $tmp2,$src2\n\t"
5123             "pshufd  $tmp,$tmp2,0x1\n\t"
5124             "pmulld  $tmp2,$tmp\n\t"
5125             "movd    $tmp,$src1\n\t"
5126             "pmulld  $tmp2,$tmp\n\t"
5127             "movd    $dst,$tmp2\t! mul reduction4I" %}
5128   ins_encode %{
5129     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5130     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5131     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5132     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5133     __ movdl($tmp$$XMMRegister, $src1$$Register);
5134     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5135     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5136   %}
5137   ins_pipe( pipe_slow );
5138 %}
5139 
5140 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5141   predicate(UseAVX > 0);
5142   match(Set dst (MulReductionVI src1 src2));
5143   effect(TEMP tmp, TEMP tmp2);
5144   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5145             "vpmulld  $tmp,$src2,$tmp2\n\t"
5146             "pshufd   $tmp2,$tmp,0x1\n\t"
5147             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5148             "movd     $tmp2,$src1\n\t"
5149             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5150             "movd     $dst,$tmp2\t! mul reduction4I" %}
5151   ins_encode %{
5152     int vector_len = 0;
5153     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5154     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5155     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5156     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5157     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5158     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5159     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5160   %}
5161   ins_pipe( pipe_slow );
5162 %}
5163 
5164 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5165   predicate(UseAVX > 0);
5166   match(Set dst (MulReductionVI src1 src2));
5167   effect(TEMP tmp, TEMP tmp2);
5168   format %{ "vextracti128_high  $tmp,$src2\n\t"
5169             "vpmulld  $tmp,$tmp,$src2\n\t"
5170             "pshufd   $tmp2,$tmp,0xE\n\t"
5171             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5172             "pshufd   $tmp2,$tmp,0x1\n\t"
5173             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5174             "movd     $tmp2,$src1\n\t"
5175             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5176             "movd     $dst,$tmp2\t! mul reduction8I" %}
5177   ins_encode %{
5178     int vector_len = 0;
5179     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5180     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5181     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5182     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5183     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5184     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5185     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5186     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5187     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5188   %}
5189   ins_pipe( pipe_slow );
5190 %}
5191 
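// For 512-bit sources the reduction first folds the upper 256 bits with
// vextracti64x4_high (one 256-bit multiply) and the upper 128 bits with
// vextracti128_high (one 128-bit multiply) before the usual in-lane
// pshufd folds.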
5192 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5193   predicate(UseAVX > 2);
5194   match(Set dst (MulReductionVI src1 src2));
5195   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5196   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5197             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5198             "vextracti128_high  $tmp,$tmp3\n\t"
5199             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5200             "pshufd   $tmp2,$tmp,0xE\n\t"
5201             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5202             "pshufd   $tmp2,$tmp,0x1\n\t"
5203             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5204             "movd     $tmp2,$src1\n\t"
5205             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5206             "movd     $dst,$tmp2\t! mul reduction16I" %}
5207   ins_encode %{
5208     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5209     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5210     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5211     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5212     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5213     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5214     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5215     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5216     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5217     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5218     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5219   %}
5220   ins_pipe( pipe_slow );
5221 %}
5222 
5223 #ifdef _LP64
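// Long multiply reductions need vpmullq, which requires AVX512DQ (see the
// supports_avx512dq() predicates), and use movdq to move 64-bit values
// between GPRs and XMM registers, so these rules exist only in the 64-bit
// (_LP64) build.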
5224 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5225   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5226   match(Set dst (MulReductionVL src1 src2));
5227   effect(TEMP tmp, TEMP tmp2);
5228   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5229             "vpmullq  $tmp,$src2,$tmp2\n\t"
5230             "movdq    $tmp2,$src1\n\t"
5231             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5232             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5233   ins_encode %{
5234     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5235     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5236     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5237     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5238     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5239   %}
5240   ins_pipe( pipe_slow );
5241 %}
5242 
5243 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5244   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5245   match(Set dst (MulReductionVL src1 src2));
5246   effect(TEMP tmp, TEMP tmp2);
5247   format %{ "vextracti128_high  $tmp,$src2\n\t"
5248             "vpmullq  $tmp2,$tmp,$src2\n\t"
5249             "pshufd   $tmp,$tmp2,0xE\n\t"
5250             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5251             "movdq    $tmp,$src1\n\t"
5252             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5253             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5254   ins_encode %{
5255     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5256     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5257     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5258     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5259     __ movdq($tmp$$XMMRegister, $src1$$Register);
5260     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5261     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5262   %}
5263   ins_pipe( pipe_slow );
5264 %}
5265 
5266 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5267   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5268   match(Set dst (MulReductionVL src1 src2));
5269   effect(TEMP tmp, TEMP tmp2);
5270   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5271             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5272             "vextracti128_high  $tmp,$tmp2\n\t"
5273             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5274             "pshufd   $tmp,$tmp2,0xE\n\t"
5275             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5276             "movdq    $tmp,$src1\n\t"
5277             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5278             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5279   ins_encode %{
5280     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5281     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5282     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5283     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5284     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5285     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5286     __ movdq($tmp$$XMMRegister, $src1$$Register);
5287     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5288     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5289   %}
5290   ins_pipe( pipe_slow );
5291 %}
5292 #endif
5293 
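// Float multiply reductions accumulate into the scalar $dst with mulss /
// vmulss; pshufd immediates 0x01, 0x02 and 0x03 rotate lanes 1, 2 and 3 of
// the source into the low element so each lane can be multiplied in turn.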
5294 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5295   predicate(UseSSE >= 1 && UseAVX == 0);
5296   match(Set dst (MulReductionVF dst src2));
5297   effect(TEMP dst, TEMP tmp);
5298   format %{ "mulss   $dst,$src2\n\t"
5299             "pshufd  $tmp,$src2,0x01\n\t"
5300             "mulss   $dst,$tmp\t! mul reduction2F" %}
5301   ins_encode %{
5302     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5303     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5304     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5305   %}
5306   ins_pipe( pipe_slow );
5307 %}
5308 
5309 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5310   predicate(UseAVX > 0);
5311   match(Set dst (MulReductionVF dst src2));
5312   effect(TEMP tmp, TEMP dst);
5313   format %{ "vmulss  $dst,$dst,$src2\n\t"
5314             "pshufd  $tmp,$src2,0x01\n\t"
5315             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5316   ins_encode %{
5317     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5318     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5319     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5320   %}
5321   ins_pipe( pipe_slow );
5322 %}
5323 
5324 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5325   predicate(UseSSE >= 1 && UseAVX == 0);
5326   match(Set dst (MulReductionVF dst src2));
5327   effect(TEMP dst, TEMP tmp);
5328   format %{ "mulss   $dst,$src2\n\t"
5329             "pshufd  $tmp,$src2,0x01\n\t"
5330             "mulss   $dst,$tmp\n\t"
5331             "pshufd  $tmp,$src2,0x02\n\t"
5332             "mulss   $dst,$tmp\n\t"
5333             "pshufd  $tmp,$src2,0x03\n\t"
5334             "mulss   $dst,$tmp\t! mul reduction4F" %}
5335   ins_encode %{
5336     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5337     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5338     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5339     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5340     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5341     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5342     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5343   %}
5344   ins_pipe( pipe_slow );
5345 %}
5346 
5347 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5348   predicate(UseAVX > 0);
5349   match(Set dst (MulReductionVF dst src2));
5350   effect(TEMP tmp, TEMP dst);
5351   format %{ "vmulss  $dst,$dst,$src2\n\t"
5352             "pshufd  $tmp,$src2,0x01\n\t"
5353             "vmulss  $dst,$dst,$tmp\n\t"
5354             "pshufd  $tmp,$src2,0x02\n\t"
5355             "vmulss  $dst,$dst,$tmp\n\t"
5356             "pshufd  $tmp,$src2,0x03\n\t"
5357             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5358   ins_encode %{
5359     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5360     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5361     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5362     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5363     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5364     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5365     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5366   %}
5367   ins_pipe( pipe_slow );
5368 %}
5369 
5370 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5371   predicate(UseAVX > 0);
5372   match(Set dst (MulReductionVF dst src2));
5373   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5374   format %{ "vmulss  $dst,$dst,$src2\n\t"
5375             "pshufd  $tmp,$src2,0x01\n\t"
5376             "vmulss  $dst,$dst,$tmp\n\t"
5377             "pshufd  $tmp,$src2,0x02\n\t"
5378             "vmulss  $dst,$dst,$tmp\n\t"
5379             "pshufd  $tmp,$src2,0x03\n\t"
5380             "vmulss  $dst,$dst,$tmp\n\t"
5381             "vextractf128_high  $tmp2,$src2\n\t"
5382             "vmulss  $dst,$dst,$tmp2\n\t"
5383             "pshufd  $tmp,$tmp2,0x01\n\t"
5384             "vmulss  $dst,$dst,$tmp\n\t"
5385             "pshufd  $tmp,$tmp2,0x02\n\t"
5386             "vmulss  $dst,$dst,$tmp\n\t"
5387             "pshufd  $tmp,$tmp2,0x03\n\t"
5388             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5389   ins_encode %{
5390     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5391     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5392     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5393     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5394     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5395     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5396     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5397     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5398     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5399     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5400     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5401     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5402     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5403     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5404     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5405   %}
5406   ins_pipe( pipe_slow );
5407 %}
5408 
5409 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5410   predicate(UseAVX > 2);
5411   match(Set dst (MulReductionVF dst src2));
5412   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5413   format %{ "vmulss  $dst,$dst,$src2\n\t"
5414             "pshufd  $tmp,$src2,0x01\n\t"
5415             "vmulss  $dst,$dst,$tmp\n\t"
5416             "pshufd  $tmp,$src2,0x02\n\t"
5417             "vmulss  $dst,$dst,$tmp\n\t"
5418             "pshufd  $tmp,$src2,0x03\n\t"
5419             "vmulss  $dst,$dst,$tmp\n\t"
5420             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5421             "vmulss  $dst,$dst,$tmp2\n\t"
5422             "pshufd  $tmp,$tmp2,0x01\n\t"
5423             "vmulss  $dst,$dst,$tmp\n\t"
5424             "pshufd  $tmp,$tmp2,0x02\n\t"
5425             "vmulss  $dst,$dst,$tmp\n\t"
5426             "pshufd  $tmp,$tmp2,0x03\n\t"
5427             "vmulss  $dst,$dst,$tmp\n\t"
5428             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5429             "vmulss  $dst,$dst,$tmp2\n\t"
5430             "pshufd  $tmp,$tmp2,0x01\n\t"
5431             "vmulss  $dst,$dst,$tmp\n\t"
5432             "pshufd  $tmp,$tmp2,0x02\n\t"
5433             "vmulss  $dst,$dst,$tmp\n\t"
5434             "pshufd  $tmp,$tmp2,0x03\n\t"
5435             "vmulss  $dst,$dst,$tmp\n\t"
5436             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5437             "vmulss  $dst,$dst,$tmp2\n\t"
5438             "pshufd  $tmp,$tmp2,0x01\n\t"
5439             "vmulss  $dst,$dst,$tmp\n\t"
5440             "pshufd  $tmp,$tmp2,0x02\n\t"
5441             "vmulss  $dst,$dst,$tmp\n\t"
5442             "pshufd  $tmp,$tmp2,0x03\n\t"
5443             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5444   ins_encode %{
5445     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5446     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5447     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5448     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5449     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5450     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5451     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5452     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5453     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5454     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5455     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5456     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5457     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5458     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5459     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5460     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5461     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5462     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5463     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5464     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5465     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5466     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5467     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5468     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5469     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5470     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5471     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5472     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5473     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5474     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5475     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5476   %}
5477   ins_pipe( pipe_slow );
5478 %}
5479 
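// Double multiply reductions follow the same shape with mulsd/vmulsd:
// pshufd 0xE brings the upper double of a 128-bit lane into the low
// position, and the wider forms first peel off 128-bit lanes with
// vextractf128_high or vextractf32x4.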
5480 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5481   predicate(UseSSE >= 1 && UseAVX == 0);
5482   match(Set dst (MulReductionVD dst src2));
5483   effect(TEMP dst, TEMP tmp);
5484   format %{ "mulsd   $dst,$src2\n\t"
5485             "pshufd  $tmp,$src2,0xE\n\t"
5486             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5487   ins_encode %{
5488     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5489     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5490     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5491   %}
5492   ins_pipe( pipe_slow );
5493 %}
5494 
5495 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5496   predicate(UseAVX > 0);
5497   match(Set dst (MulReductionVD dst src2));
5498   effect(TEMP tmp, TEMP dst);
5499   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5500             "pshufd  $tmp,$src2,0xE\n\t"
5501             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5502   ins_encode %{
5503     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5504     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5505     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5506   %}
5507   ins_pipe( pipe_slow );
5508 %}
5509 
5510 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5511   predicate(UseAVX > 0);
5512   match(Set dst (MulReductionVD dst src2));
5513   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5514   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5515             "pshufd  $tmp,$src2,0xE\n\t"
5516             "vmulsd  $dst,$dst,$tmp\n\t"
5517             "vextractf128_high  $tmp2,$src2\n\t"
5518             "vmulsd  $dst,$dst,$tmp2\n\t"
5519             "pshufd  $tmp,$tmp2,0xE\n\t"
5520             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5521   ins_encode %{
5522     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5523     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5524     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5525     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5526     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5527     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5528     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5529   %}
5530   ins_pipe( pipe_slow );
5531 %}
5532 
5533 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5534   predicate(UseAVX > 2);
5535   match(Set dst (MulReductionVD dst src2));
5536   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5537   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5538             "pshufd  $tmp,$src2,0xE\n\t"
5539             "vmulsd  $dst,$dst,$tmp\n\t"
5540             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5541             "vmulsd  $dst,$dst,$tmp2\n\t"
5542             "pshufd  $tmp,$tmp2,0xE\n\t"
5543             "vmulsd  $dst,$dst,$tmp\n\t"
5544             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5545             "vmulsd  $dst,$dst,$tmp2\n\t"
5546             "pshufd  $tmp,$tmp2,0xE\n\t"
5547             "vmulsd  $dst,$dst,$tmp\n\t"
5548             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5549             "vmulsd  $dst,$dst,$tmp2\n\t"
5550             "pshufd  $tmp,$tmp2,0xE\n\t"
5551             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5552   ins_encode %{
5553     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5554     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5555     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5556     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5557     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5558     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5559     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5560     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5561     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5562     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5563     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5564     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5565     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5566     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5567     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5568   %}
5569   ins_pipe( pipe_slow );
5570 %}
5571 
5572 // ====================VECTOR ARITHMETIC=======================================
5573 
5574 // --------------------------------- ADD --------------------------------------
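// The byte and short adds below come in several flavors, selected by
// predicate: the plain SSE form (two-operand, destructive, UseAVX == 0),
// _reg_avx/_mem_avx for VEX-encoded three-operand forms on CPUs without
// EVEX, _reg_evex/_mem_evex when AVX512BW is available, and
// _reg_evex_special/_mem_evex_special as the fallback on AVX-512 CPUs
// without BW.  The vector_len argument passed to the assembler selects the
// encoded width: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.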
5575 
5576 // Bytes vector add
5577 instruct vadd4B(vecS dst, vecS src) %{
5578   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5579   match(Set dst (AddVB dst src));
5580   format %{ "paddb   $dst,$src\t! add packed4B" %}
5581   ins_encode %{
5582     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5583   %}
5584   ins_pipe( pipe_slow );
5585 %}
5586 
5587 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5588   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5589   match(Set dst (AddVB src1 src2));
5590   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5591   ins_encode %{
5592     int vector_len = 0;
5593     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5594   %}
5595   ins_pipe( pipe_slow );
5596 %}
5597 
5598 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5599   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5600   match(Set dst (AddVB src1 src2));
5601   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5602   ins_encode %{
5603     int vector_len = 0;
5604     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5605   %}
5606   ins_pipe( pipe_slow );
5607 %}
5608 
5609 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5610   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5611   match(Set dst (AddVB dst src2));
5612   effect(TEMP src1);
5613   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5614   ins_encode %{
5615     int vector_len = 0;
5616     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5617   %}
5618   ins_pipe( pipe_slow );
5619 %}
5620 
5621 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5622   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5623   match(Set dst (AddVB src (LoadVector mem)));
5624   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5625   ins_encode %{
5626     int vector_len = 0;
5627     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5628   %}
5629   ins_pipe( pipe_slow );
5630 %}
5631 
5632 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5633   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5634   match(Set dst (AddVB src (LoadVector mem)));
5635   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5636   ins_encode %{
5637     int vector_len = 0;
5638     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5639   %}
5640   ins_pipe( pipe_slow );
5641 %}
5642 
5643 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5644   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5645   match(Set dst (AddVB dst (LoadVector mem)));
5646   effect(TEMP src);
5647   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5648   ins_encode %{
5649     int vector_len = 0;
5650     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5651   %}
5652   ins_pipe( pipe_slow );
5653 %}
5654 
5655 instruct vadd8B(vecD dst, vecD src) %{
5656   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5657   match(Set dst (AddVB dst src));
5658   format %{ "paddb   $dst,$src\t! add packed8B" %}
5659   ins_encode %{
5660     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5661   %}
5662   ins_pipe( pipe_slow );
5663 %}
5664 
5665 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5666   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5667   match(Set dst (AddVB src1 src2));
5668   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5669   ins_encode %{
5670     int vector_len = 0;
5671     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5672   %}
5673   ins_pipe( pipe_slow );
5674 %}
5675 
5676 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5677   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5678   match(Set dst (AddVB src1 src2));
5679   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5680   ins_encode %{
5681     int vector_len = 0;
5682     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5683   %}
5684   ins_pipe( pipe_slow );
5685 %}
5686 
5687 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5688   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5689   match(Set dst (AddVB dst src2));
5690   effect(TEMP src1);
5691   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5692   ins_encode %{
5693     int vector_len = 0;
5694     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5695   %}
5696   ins_pipe( pipe_slow );
5697 %}
5698 
5699 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5700   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5701   match(Set dst (AddVB src (LoadVector mem)));
5702   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5703   ins_encode %{
5704     int vector_len = 0;
5705     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5706   %}
5707   ins_pipe( pipe_slow );
5708 %}
5709 
5710 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5711   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5712   match(Set dst (AddVB src (LoadVector mem)));
5713   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5714   ins_encode %{
5715     int vector_len = 0;
5716     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5717   %}
5718   ins_pipe( pipe_slow );
5719 %}
5720 
5721 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5722   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5723   match(Set dst (AddVB dst (LoadVector mem)));
5724   effect(TEMP src);
5725   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5726   ins_encode %{
5727     int vector_len = 0;
5728     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5729   %}
5730   ins_pipe( pipe_slow );
5731 %}
5732 
5733 instruct vadd16B(vecX dst, vecX src) %{
5734   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5735   match(Set dst (AddVB dst src));
5736   format %{ "paddb   $dst,$src\t! add packed16B" %}
5737   ins_encode %{
5738     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5739   %}
5740   ins_pipe( pipe_slow );
5741 %}
5742 
5743 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5744   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5745   match(Set dst (AddVB src1 src2));
5746   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5747   ins_encode %{
5748     int vector_len = 0;
5749     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5750   %}
5751   ins_pipe( pipe_slow );
5752 %}
5753 
5754 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5755   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5756   match(Set dst (AddVB src1 src2));
5757   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5758   ins_encode %{
5759     int vector_len = 0;
5760     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5761   %}
5762   ins_pipe( pipe_slow );
5763 %}
5764 
5765 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5766   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5767   match(Set dst (AddVB dst src2));
5768   effect(TEMP src1);
5769   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5770   ins_encode %{
5771     int vector_len = 0;
5772     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5773   %}
5774   ins_pipe( pipe_slow );
5775 %}
5776 
5777 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5778   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5779   match(Set dst (AddVB src (LoadVector mem)));
5780   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5781   ins_encode %{
5782     int vector_len = 0;
5783     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5784   %}
5785   ins_pipe( pipe_slow );
5786 %}
5787 
5788 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5789   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5790   match(Set dst (AddVB src (LoadVector mem)));
5791   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5792   ins_encode %{
5793     int vector_len = 0;
5794     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5795   %}
5796   ins_pipe( pipe_slow );
5797 %}
5798 
5799 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5800   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5801   match(Set dst (AddVB dst (LoadVector mem)));
5802   effect(TEMP src);
5803   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5804   ins_encode %{
5805     int vector_len = 0;
5806     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5807   %}
5808   ins_pipe( pipe_slow );
5809 %}
5810 
5811 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5812   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5813   match(Set dst (AddVB src1 src2));
5814   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5815   ins_encode %{
5816     int vector_len = 1;
5817     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5818   %}
5819   ins_pipe( pipe_slow );
5820 %}
5821 
5822 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5823   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5824   match(Set dst (AddVB src1 src2));
5825   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5826   ins_encode %{
5827     int vector_len = 1;
5828     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5829   %}
5830   ins_pipe( pipe_slow );
5831 %}
5832 
5833 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5834   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5835   match(Set dst (AddVB dst src2));
5836   effect(TEMP src1);
5837   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5838   ins_encode %{
5839     int vector_len = 1;
5840     __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5841   %}
5842   ins_pipe( pipe_slow );
5843 %}
5844 
5845 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5846   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5847   match(Set dst (AddVB src (LoadVector mem)));
5848   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5849   ins_encode %{
5850     int vector_len = 1;
5851     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5852   %}
5853   ins_pipe( pipe_slow );
5854 %}
5855 
5856 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5857   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5858   match(Set dst (AddVB src (LoadVector mem)));
5859   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5860   ins_encode %{
5861     int vector_len = 1;
5862     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5863   %}
5864   ins_pipe( pipe_slow );
5865 %}
5866 
5867 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
5868   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5869   match(Set dst (AddVB dst (LoadVector mem)));
5870   effect(TEMP src);
5871   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5872   ins_encode %{
5873     int vector_len = 1;
5874     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5875   %}
5876   ins_pipe( pipe_slow );
5877 %}
5878 
5879 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5880   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5881   match(Set dst (AddVB src1 src2));
5882   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5883   ins_encode %{
5884     int vector_len = 2;
5885     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5886   %}
5887   ins_pipe( pipe_slow );
5888 %}
5889 
5890 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5891   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5892   match(Set dst (AddVB src (LoadVector mem)));
5893   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
5894   ins_encode %{
5895     int vector_len = 2;
5896     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5897   %}
5898   ins_pipe( pipe_slow );
5899 %}
5900 
5901 // Shorts/Chars vector add
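// Short/char adds mirror the byte-add variants above, using paddw/vpaddw;
// the same AVX512BW split applies because 16-bit element operations also
// need the BW extension under EVEX.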
5902 instruct vadd2S(vecS dst, vecS src) %{
5903   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
5904   match(Set dst (AddVS dst src));
5905   format %{ "paddw   $dst,$src\t! add packed2S" %}
5906   ins_encode %{
5907     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5908   %}
5909   ins_pipe( pipe_slow );
5910 %}
5911 
5912 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
5913   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5914   match(Set dst (AddVS src1 src2));
5915   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5916   ins_encode %{
5917     int vector_len = 0;
5918     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5919   %}
5920   ins_pipe( pipe_slow );
5921 %}
5922 
5923 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
5924   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5925   match(Set dst (AddVS src1 src2));
5926   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5927   ins_encode %{
5928     int vector_len = 0;
5929     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5930   %}
5931   ins_pipe( pipe_slow );
5932 %}
5933 
5934 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5935   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5936   match(Set dst (AddVS dst src2));
5937   effect(TEMP src1);
5938   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
5939   ins_encode %{
5940     int vector_len = 0;
5941     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
5942   %}
5943   ins_pipe( pipe_slow );
5944 %}
5945 
5946 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
5947   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5948   match(Set dst (AddVS src (LoadVector mem)));
5949   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5950   ins_encode %{
5951     int vector_len = 0;
5952     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5953   %}
5954   ins_pipe( pipe_slow );
5955 %}
5956 
5957 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
5958   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5959   match(Set dst (AddVS src (LoadVector mem)));
5960   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5961   ins_encode %{
5962     int vector_len = 0;
5963     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5964   %}
5965   ins_pipe( pipe_slow );
5966 %}
5967 
5968 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
5969   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5970   match(Set dst (AddVS dst (LoadVector mem)));
5971   effect(TEMP src);
5972   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5973   ins_encode %{
5974     int vector_len = 0;
5975     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5976   %}
5977   ins_pipe( pipe_slow );
5978 %}
5979 
5980 instruct vadd4S(vecD dst, vecD src) %{
5981   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5982   match(Set dst (AddVS dst src));
5983   format %{ "paddw   $dst,$src\t! add packed4S" %}
5984   ins_encode %{
5985     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5986   %}
5987   ins_pipe( pipe_slow );
5988 %}
5989 
5990 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
5991   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5992   match(Set dst (AddVS src1 src2));
5993   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5994   ins_encode %{
5995     int vector_len = 0;
5996     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5997   %}
5998   ins_pipe( pipe_slow );
5999 %}
6000 
6001 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6002   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6003   match(Set dst (AddVS src1 src2));
6004   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6005   ins_encode %{
6006     int vector_len = 0;
6007     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6008   %}
6009   ins_pipe( pipe_slow );
6010 %}
6011 
6012 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6013   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6014   match(Set dst (AddVS dst src2));
6015   effect(TEMP src1);
6016   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
6017   ins_encode %{
6018     int vector_len = 0;
6019     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6020   %}
6021   ins_pipe( pipe_slow );
6022 %}
6023 
6024 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
6025   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6026   match(Set dst (AddVS src (LoadVector mem)));
6027   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6028   ins_encode %{
6029     int vector_len = 0;
6030     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6031   %}
6032   ins_pipe( pipe_slow );
6033 %}
6034 
6035 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
6036   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6037   match(Set dst (AddVS src (LoadVector mem)));
6038   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6039   ins_encode %{
6040     int vector_len = 0;
6041     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6042   %}
6043   ins_pipe( pipe_slow );
6044 %}
6045 
6046 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6047   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6048   match(Set dst (AddVS dst (LoadVector mem)));
6049   effect(TEMP src);
6050   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6051   ins_encode %{
6052     int vector_len = 0;
6053     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6054   %}
6055   ins_pipe( pipe_slow );
6056 %}
6057 
6058 instruct vadd8S(vecX dst, vecX src) %{
6059   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6060   match(Set dst (AddVS dst src));
6061   format %{ "paddw   $dst,$src\t! add packed8S" %}
6062   ins_encode %{
6063     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6064   %}
6065   ins_pipe( pipe_slow );
6066 %}
6067 
6068 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6069   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6070   match(Set dst (AddVS src1 src2));
6071   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6072   ins_encode %{
6073     int vector_len = 0;
6074     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6075   %}
6076   ins_pipe( pipe_slow );
6077 %}
6078 
6079 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6080   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6081   match(Set dst (AddVS src1 src2));
6082   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6083   ins_encode %{
6084     int vector_len = 0;
6085     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6086   %}
6087   ins_pipe( pipe_slow );
6088 %}
6089 
6090 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6091   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6092   match(Set dst (AddVS dst src2));
6093   effect(TEMP src1);
6094   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
6095   ins_encode %{
6096     int vector_len = 0;
6097     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6098   %}
6099   ins_pipe( pipe_slow );
6100 %}
6101 
6102 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
6103   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6104   match(Set dst (AddVS src (LoadVector mem)));
6105   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6106   ins_encode %{
6107     int vector_len = 0;
6108     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6109   %}
6110   ins_pipe( pipe_slow );
6111 %}
6112 
6113 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
6114   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6115   match(Set dst (AddVS src (LoadVector mem)));
6116   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6117   ins_encode %{
6118     int vector_len = 0;
6119     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6120   %}
6121   ins_pipe( pipe_slow );
6122 %}
6123 
6124 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
6125   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6126   match(Set dst (AddVS dst (LoadVector mem)));
6127   effect(TEMP src);
6128   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6129   ins_encode %{
6130     int vector_len = 0;
6131     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6132   %}
6133   ins_pipe( pipe_slow );
6134 %}
6135 
6136 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6137   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6138   match(Set dst (AddVS src1 src2));
6139   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6140   ins_encode %{
6141     int vector_len = 1;
6142     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6143   %}
6144   ins_pipe( pipe_slow );
6145 %}
6146 
6147 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6148   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6149   match(Set dst (AddVS src1 src2));
6150   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6151   ins_encode %{
6152     int vector_len = 1;
6153     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6154   %}
6155   ins_pipe( pipe_slow );
6156 %}
6157 
6158 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6159   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6160   match(Set dst (AddVS dst src2));
6161   effect(TEMP src1);
6162   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6163   ins_encode %{
6164     int vector_len = 1;
6165     __ vpaddw($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister, vector_len);
6166   %}
6167   ins_pipe( pipe_slow );
6168 %}
6169 
6170 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6171   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6172   match(Set dst (AddVS src (LoadVector mem)));
6173   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6174   ins_encode %{
6175     int vector_len = 1;
6176     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6177   %}
6178   ins_pipe( pipe_slow );
6179 %}
6180 
6181 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6182   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6183   match(Set dst (AddVS src (LoadVector mem)));
6184   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6185   ins_encode %{
6186     int vector_len = 1;
6187     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6188   %}
6189   ins_pipe( pipe_slow );
6190 %}
6191 
6192 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6193   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6194   match(Set dst (AddVS dst (LoadVector mem)));
6195   effect(TEMP src);
6196   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6197   ins_encode %{
6198     int vector_len = 1;
6199     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6200   %}
6201   ins_pipe( pipe_slow );
6202 %}
6203 
6204 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6205   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6206   match(Set dst (AddVS src1 src2));
6207   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6208   ins_encode %{
6209     int vector_len = 2;
6210     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6211   %}
6212   ins_pipe( pipe_slow );
6213 %}
6214 
6215 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6216   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6217   match(Set dst (AddVS src (LoadVector mem)));
6218   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6219   ins_encode %{
6220     int vector_len = 2;
6221     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6222   %}
6223   ins_pipe( pipe_slow );
6224 %}
6225 
6226 // Integers vector add
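// Int adds do not depend on AVX512BW, so the rules split only on vector
// length: UseAVX > 0 for the 128-bit VEX form, UseAVX > 1 (AVX2) for the
// 256-bit form and UseAVX > 2 for the 512-bit form; there are no
// _evex_special variants here.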
6227 instruct vadd2I(vecD dst, vecD src) %{
6228   predicate(n->as_Vector()->length() == 2);
6229   match(Set dst (AddVI dst src));
6230   format %{ "paddd   $dst,$src\t! add packed2I" %}
6231   ins_encode %{
6232     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6233   %}
6234   ins_pipe( pipe_slow );
6235 %}
6236 
6237 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6238   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6239   match(Set dst (AddVI src1 src2));
6240   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6241   ins_encode %{
6242     int vector_len = 0;
6243     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6244   %}
6245   ins_pipe( pipe_slow );
6246 %}
6247 
6248 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6249   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6250   match(Set dst (AddVI src (LoadVector mem)));
6251   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6252   ins_encode %{
6253     int vector_len = 0;
6254     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6255   %}
6256   ins_pipe( pipe_slow );
6257 %}
6258 
6259 instruct vadd4I(vecX dst, vecX src) %{
6260   predicate(n->as_Vector()->length() == 4);
6261   match(Set dst (AddVI dst src));
6262   format %{ "paddd   $dst,$src\t! add packed4I" %}
6263   ins_encode %{
6264     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6265   %}
6266   ins_pipe( pipe_slow );
6267 %}
6268 
6269 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6270   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6271   match(Set dst (AddVI src1 src2));
6272   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6273   ins_encode %{
6274     int vector_len = 0;
6275     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6276   %}
6277   ins_pipe( pipe_slow );
6278 %}
6279 
6280 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6281   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6282   match(Set dst (AddVI src (LoadVector mem)));
6283   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6284   ins_encode %{
6285     int vector_len = 0;
6286     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6287   %}
6288   ins_pipe( pipe_slow );
6289 %}
6290 
6291 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6292   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6293   match(Set dst (AddVI src1 src2));
6294   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6295   ins_encode %{
6296     int vector_len = 1;
6297     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6298   %}
6299   ins_pipe( pipe_slow );
6300 %}
6301 
6302 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6303   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6304   match(Set dst (AddVI src (LoadVector mem)));
6305   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6306   ins_encode %{
6307     int vector_len = 1;
6308     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6309   %}
6310   ins_pipe( pipe_slow );
6311 %}
6312 
6313 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6314   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6315   match(Set dst (AddVI src1 src2));
6316   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6317   ins_encode %{
6318     int vector_len = 2;
6319     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6320   %}
6321   ins_pipe( pipe_slow );
6322 %}
6323 
6324 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6325   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6326   match(Set dst (AddVI src (LoadVector mem)));
6327   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6328   ins_encode %{
6329     int vector_len = 2;
6330     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6331   %}
6332   ins_pipe( pipe_slow );
6333 %}
6334 
6335 // Longs vector add
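// Long adds use paddq/vpaddq; as with ints, the 256-bit form needs AVX2 and
// the 512-bit form AVX512F, matching the UseAVX > 1 / UseAVX > 2 predicates.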
6336 instruct vadd2L(vecX dst, vecX src) %{
6337   predicate(n->as_Vector()->length() == 2);
6338   match(Set dst (AddVL dst src));
6339   format %{ "paddq   $dst,$src\t! add packed2L" %}
6340   ins_encode %{
6341     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6342   %}
6343   ins_pipe( pipe_slow );
6344 %}
6345 
6346 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6347   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6348   match(Set dst (AddVL src1 src2));
6349   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6350   ins_encode %{
6351     int vector_len = 0;
6352     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6353   %}
6354   ins_pipe( pipe_slow );
6355 %}
6356 
6357 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6358   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6359   match(Set dst (AddVL src (LoadVector mem)));
6360   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6361   ins_encode %{
6362     int vector_len = 0;
6363     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6364   %}
6365   ins_pipe( pipe_slow );
6366 %}
6367 
6368 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6369   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6370   match(Set dst (AddVL src1 src2));
6371   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6372   ins_encode %{
6373     int vector_len = 1;
6374     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6375   %}
6376   ins_pipe( pipe_slow );
6377 %}
6378 
6379 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6380   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6381   match(Set dst (AddVL src (LoadVector mem)));
6382   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6383   ins_encode %{
6384     int vector_len = 1;
6385     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6386   %}
6387   ins_pipe( pipe_slow );
6388 %}
6389 
6390 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6391   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6392   match(Set dst (AddVL src1 src2));
6393   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6394   ins_encode %{
6395     int vector_len = 2;
6396     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6397   %}
6398   ins_pipe( pipe_slow );
6399 %}
6400 
6401 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6402   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6403   match(Set dst (AddVL src (LoadVector mem)));
6404   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6405   ins_encode %{
6406     int vector_len = 2;
6407     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6408   %}
6409   ins_pipe( pipe_slow );
6410 %}
6411 
6412 // Floats vector add
6413 instruct vadd2F(vecD dst, vecD src) %{
6414   predicate(n->as_Vector()->length() == 2);
6415   match(Set dst (AddVF dst src));
6416   format %{ "addps   $dst,$src\t! add packed2F" %}
6417   ins_encode %{
6418     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6419   %}
6420   ins_pipe( pipe_slow );
6421 %}
6422 
6423 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6424   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6425   match(Set dst (AddVF src1 src2));
6426   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6427   ins_encode %{
6428     int vector_len = 0;
6429     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6430   %}
6431   ins_pipe( pipe_slow );
6432 %}
6433 
6434 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6435   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6436   match(Set dst (AddVF src (LoadVector mem)));
6437   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6438   ins_encode %{
6439     int vector_len = 0;
6440     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6441   %}
6442   ins_pipe( pipe_slow );
6443 %}
6444 
6445 instruct vadd4F(vecX dst, vecX src) %{
6446   predicate(n->as_Vector()->length() == 4);
6447   match(Set dst (AddVF dst src));
6448   format %{ "addps   $dst,$src\t! add packed4F" %}
6449   ins_encode %{
6450     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6451   %}
6452   ins_pipe( pipe_slow );
6453 %}
6454 
6455 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6456   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6457   match(Set dst (AddVF src1 src2));
6458   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6459   ins_encode %{
6460     int vector_len = 0;
6461     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6462   %}
6463   ins_pipe( pipe_slow );
6464 %}
6465 
6466 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6467   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6468   match(Set dst (AddVF src (LoadVector mem)));
6469   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6470   ins_encode %{
6471     int vector_len = 0;
6472     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6473   %}
6474   ins_pipe( pipe_slow );
6475 %}
6476 
6477 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6478   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6479   match(Set dst (AddVF src1 src2));
6480   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6481   ins_encode %{
6482     int vector_len = 1;
6483     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6484   %}
6485   ins_pipe( pipe_slow );
6486 %}
6487 
6488 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6489   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6490   match(Set dst (AddVF src (LoadVector mem)));
6491   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6492   ins_encode %{
6493     int vector_len = 1;
6494     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6495   %}
6496   ins_pipe( pipe_slow );
6497 %}
6498 
6499 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6500   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6501   match(Set dst (AddVF src1 src2));
6502   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6503   ins_encode %{
6504     int vector_len = 2;
6505     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6506   %}
6507   ins_pipe( pipe_slow );
6508 %}
6509 
6510 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6511   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6512   match(Set dst (AddVF src (LoadVector mem)));
6513   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6514   ins_encode %{
6515     int vector_len = 2;
6516     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6517   %}
6518   ins_pipe( pipe_slow );
6519 %}
6520 
6521 // Doubles vector add
6522 instruct vadd2D(vecX dst, vecX src) %{
6523   predicate(n->as_Vector()->length() == 2);
6524   match(Set dst (AddVD dst src));
6525   format %{ "addpd   $dst,$src\t! add packed2D" %}
6526   ins_encode %{
6527     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6528   %}
6529   ins_pipe( pipe_slow );
6530 %}
6531 
6532 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6533   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6534   match(Set dst (AddVD src1 src2));
6535   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6536   ins_encode %{
6537     int vector_len = 0;
6538     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6539   %}
6540   ins_pipe( pipe_slow );
6541 %}
6542 
6543 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6544   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6545   match(Set dst (AddVD src (LoadVector mem)));
6546   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6547   ins_encode %{
6548     int vector_len = 0;
6549     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6550   %}
6551   ins_pipe( pipe_slow );
6552 %}
6553 
6554 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6555   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6556   match(Set dst (AddVD src1 src2));
6557   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6558   ins_encode %{
6559     int vector_len = 1;
6560     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6561   %}
6562   ins_pipe( pipe_slow );
6563 %}
6564 
6565 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6566   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6567   match(Set dst (AddVD src (LoadVector mem)));
6568   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6569   ins_encode %{
6570     int vector_len = 1;
6571     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6572   %}
6573   ins_pipe( pipe_slow );
6574 %}
6575 
6576 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6577   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6578   match(Set dst (AddVD src1 src2));
6579   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6580   ins_encode %{
6581     int vector_len = 2;
6582     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6583   %}
6584   ins_pipe( pipe_slow );
6585 %}
6586 
6587 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6588   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6589   match(Set dst (AddVD src (LoadVector mem)));
6590   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6591   ins_encode %{
6592     int vector_len = 2;
6593     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6594   %}
6595   ins_pipe( pipe_slow );
6596 %}
6597 
6598 // --------------------------------- SUB --------------------------------------
6599 
6600 // Bytes vector sub
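// Byte subtraction (and the short/char subtract and multiply rules further
// below) is split three ways by predicate: *_avx rules for AVX without
// AVX-512 (VM_Version::supports_avxonly()), *_evex rules for AVX-512 with the
// BW extension (supports_avx512bw()), and *_evex_special rules for AVX-512
// without BW (supports_avx512nobw()).  EVEX-encoded byte/word instructions
// require AVX512BW, hence the separate nobw rules, which match the
// two-operand form (dst is also the first input) and carry an extra TEMP
// register operand.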
6601 instruct vsub4B(vecS dst, vecS src) %{
6602   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6603   match(Set dst (SubVB dst src));
6604   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6605   ins_encode %{
6606     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6607   %}
6608   ins_pipe( pipe_slow );
6609 %}
6610 
6611 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6612   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6613   match(Set dst (SubVB src1 src2));
6614   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6615   ins_encode %{
6616     int vector_len = 0;
6617     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6618   %}
6619   ins_pipe( pipe_slow );
6620 %}
6621 
6622 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6623   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6624   match(Set dst (SubVB src1 src2));
6625   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6626   ins_encode %{
6627     int vector_len = 0;
6628     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6629   %}
6630   ins_pipe( pipe_slow );
6631 %}
6632 
6633 instruct vsub4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6634   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6635   match(Set dst (SubVB dst src2));
6636   effect(TEMP src1);
6637   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6638   ins_encode %{
6639     int vector_len = 0;
6640     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6641   %}
6642   ins_pipe( pipe_slow );
6643 %}
6644 
6645 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6646   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6647   match(Set dst (SubVB src (LoadVector mem)));
6648   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6649   ins_encode %{
6650     int vector_len = 0;
6651     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6652   %}
6653   ins_pipe( pipe_slow );
6654 %}
6655 
6656 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6657   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6658   match(Set dst (SubVB src (LoadVector mem)));
6659   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6660   ins_encode %{
6661     int vector_len = 0;
6662     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6663   %}
6664   ins_pipe( pipe_slow );
6665 %}
6666 
6667 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6668   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6669   match(Set dst (SubVB dst (LoadVector mem)));
6670   effect(TEMP src);
6671   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6672   ins_encode %{
6673     int vector_len = 0;
6674     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6675   %}
6676   ins_pipe( pipe_slow );
6677 %}
6678 
6679 instruct vsub8B(vecD dst, vecD src) %{
6680   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6681   match(Set dst (SubVB dst src));
6682   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6683   ins_encode %{
6684     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6685   %}
6686   ins_pipe( pipe_slow );
6687 %}
6688 
6689 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6690   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6691   match(Set dst (SubVB src1 src2));
6692   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6693   ins_encode %{
6694     int vector_len = 0;
6695     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6696   %}
6697   ins_pipe( pipe_slow );
6698 %}
6699 
6700 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6701   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6702   match(Set dst (SubVB src1 src2));
6703   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6704   ins_encode %{
6705     int vector_len = 0;
6706     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6707   %}
6708   ins_pipe( pipe_slow );
6709 %}
6710 
6711 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6712   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6713   match(Set dst (SubVB dst src2));
6714   effect(TEMP src1);
6715   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6716   ins_encode %{
6717     int vector_len = 0;
6718     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6719   %}
6720   ins_pipe( pipe_slow );
6721 %}
6722 
6723 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6724   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6725   match(Set dst (SubVB src (LoadVector mem)));
6726   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6727   ins_encode %{
6728     int vector_len = 0;
6729     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6730   %}
6731   ins_pipe( pipe_slow );
6732 %}
6733 
6734 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6735   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6736   match(Set dst (SubVB src (LoadVector mem)));
6737   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6738   ins_encode %{
6739     int vector_len = 0;
6740     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6741   %}
6742   ins_pipe( pipe_slow );
6743 %}
6744 
6745 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6746   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6747   match(Set dst (SubVB dst (LoadVector mem)));
6748   effect(TEMP src);
6749   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6750   ins_encode %{
6751     int vector_len = 0;
6752     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6753   %}
6754   ins_pipe( pipe_slow );
6755 %}
6756 
6757 instruct vsub16B(vecX dst, vecX src) %{
6758   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6759   match(Set dst (SubVB dst src));
6760   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6761   ins_encode %{
6762     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6763   %}
6764   ins_pipe( pipe_slow );
6765 %}
6766 
6767 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6768   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6769   match(Set dst (SubVB src1 src2));
6770   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6771   ins_encode %{
6772     int vector_len = 0;
6773     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6774   %}
6775   ins_pipe( pipe_slow );
6776 %}
6777 
6778 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6779   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6780   match(Set dst (SubVB src1 src2));
6781   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6782   ins_encode %{
6783     int vector_len = 0;
6784     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6785   %}
6786   ins_pipe( pipe_slow );
6787 %}
6788 
6789 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6790   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6791   match(Set dst (SubVB dst src2));
6792   effect(TEMP src1);
6793   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6794   ins_encode %{
6795     int vector_len = 0;
6796     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6797   %}
6798   ins_pipe( pipe_slow );
6799 %}
6800 
6801 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6802   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6803   match(Set dst (SubVB src (LoadVector mem)));
6804   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6805   ins_encode %{
6806     int vector_len = 0;
6807     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6808   %}
6809   ins_pipe( pipe_slow );
6810 %}
6811 
6812 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6813   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6814   match(Set dst (SubVB src (LoadVector mem)));
6815   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6816   ins_encode %{
6817     int vector_len = 0;
6818     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6819   %}
6820   ins_pipe( pipe_slow );
6821 %}
6822 
6823 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6824   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6825   match(Set dst (SubVB dst (LoadVector mem)));
6826   effect(TEMP src);
6827   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6828   ins_encode %{
6829     int vector_len = 0;
6830     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6831   %}
6832   ins_pipe( pipe_slow );
6833 %}
6834 
6835 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6836   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6837   match(Set dst (SubVB src1 src2));
6838   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6839   ins_encode %{
6840     int vector_len = 1;
6841     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6842   %}
6843   ins_pipe( pipe_slow );
6844 %}
6845 
6846 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6847   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6848   match(Set dst (SubVB src1 src2));
6849   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6850   ins_encode %{
6851     int vector_len = 1;
6852     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6853   %}
6854   ins_pipe( pipe_slow );
6855 %}
6856 
6857 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6858   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6859   match(Set dst (SubVB dst src2));
6860   effect(TEMP src1);
6861   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6862   ins_encode %{
6863     int vector_len = 1;
6864     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6865   %}
6866   ins_pipe( pipe_slow );
6867 %}
6868 
6869 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
6870   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6871   match(Set dst (SubVB src (LoadVector mem)));
6872   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6873   ins_encode %{
6874     int vector_len = 1;
6875     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6876   %}
6877   ins_pipe( pipe_slow );
6878 %}
6879 
6880 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
6881   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6882   match(Set dst (SubVB src (LoadVector mem)));
6883   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6884   ins_encode %{
6885     int vector_len = 1;
6886     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6887   %}
6888   ins_pipe( pipe_slow );
6889 %}
6890 
6891 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6892   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6893   match(Set dst (SubVB dst (LoadVector mem)));
6894   effect(TEMP src);
6895   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6896   ins_encode %{
6897     int vector_len = 1;
6898     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6899   %}
6900   ins_pipe( pipe_slow );
6901 %}
6902 
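// 512-bit byte subtraction is only available with AVX512BW; there is no
// nobw fallback at this width.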
6903 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6904   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6905   match(Set dst (SubVB src1 src2));
6906   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6907   ins_encode %{
6908     int vector_len = 2;
6909     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6910   %}
6911   ins_pipe( pipe_slow );
6912 %}
6913 
6914 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6915   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6916   match(Set dst (SubVB src (LoadVector mem)));
6917   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6918   ins_encode %{
6919     int vector_len = 2;
6920     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6921   %}
6922   ins_pipe( pipe_slow );
6923 %}
6924 
6925 // Shorts/Chars vector sub
6926 instruct vsub2S(vecS dst, vecS src) %{
6927   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6928   match(Set dst (SubVS dst src));
6929   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6930   ins_encode %{
6931     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6932   %}
6933   ins_pipe( pipe_slow );
6934 %}
6935 
6936 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6937   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6938   match(Set dst (SubVS src1 src2));
6939   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6940   ins_encode %{
6941     int vector_len = 0;
6942     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6943   %}
6944   ins_pipe( pipe_slow );
6945 %}
6946 
6947 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6948   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6949   match(Set dst (SubVS src1 src2));
6950   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6951   ins_encode %{
6952     int vector_len = 0;
6953     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6954   %}
6955   ins_pipe( pipe_slow );
6956 %}
6957 
6958 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6959   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6960   match(Set dst (SubVS dst src2));
6961   effect(TEMP src1);
6962   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6963   ins_encode %{
6964     int vector_len = 0;
6965     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6966   %}
6967   ins_pipe( pipe_slow );
6968 %}
6969 
6970 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
6971   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6972   match(Set dst (SubVS src (LoadVector mem)));
6973   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6974   ins_encode %{
6975     int vector_len = 0;
6976     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6977   %}
6978   ins_pipe( pipe_slow );
6979 %}
6980 
6981 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
6982   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6983   match(Set dst (SubVS src (LoadVector mem)));
6984   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6985   ins_encode %{
6986     int vector_len = 0;
6987     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6988   %}
6989   ins_pipe( pipe_slow );
6990 %}
6991 
6992 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6993   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6994   match(Set dst (SubVS dst (LoadVector mem)));
6995   effect(TEMP src);
6996   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6997   ins_encode %{
6998     int vector_len = 0;
6999     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7000   %}
7001   ins_pipe( pipe_slow );
7002 %}
7003 
7004 instruct vsub4S(vecD dst, vecD src) %{
7005   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7006   match(Set dst (SubVS dst src));
7007   format %{ "psubw   $dst,$src\t! sub packed4S" %}
7008   ins_encode %{
7009     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7010   %}
7011   ins_pipe( pipe_slow );
7012 %}
7013 
7014 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7015   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7016   match(Set dst (SubVS src1 src2));
7017   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7018   ins_encode %{
7019     int vector_len = 0;
7020     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7021   %}
7022   ins_pipe( pipe_slow );
7023 %}
7024 
7025 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7026   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7027   match(Set dst (SubVS src1 src2));
7028   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7029   ins_encode %{
7030     int vector_len = 0;
7031     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7032   %}
7033   ins_pipe( pipe_slow );
7034 %}
7035 
7036 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7037   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7038   match(Set dst (SubVS dst src2));
7039   effect(TEMP src1);
7040   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7041   ins_encode %{
7042     int vector_len = 0;
7043     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7044   %}
7045   ins_pipe( pipe_slow );
7046 %}
7047 
7048 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
7049   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7050   match(Set dst (SubVS src (LoadVector mem)));
7051   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7052   ins_encode %{
7053     int vector_len = 0;
7054     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7055   %}
7056   ins_pipe( pipe_slow );
7057 %}
7058 
7059 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
7060   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7061   match(Set dst (SubVS src (LoadVector mem)));
7062   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7063   ins_encode %{
7064     int vector_len = 0;
7065     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7066   %}
7067   ins_pipe( pipe_slow );
7068 %}
7069 
7070 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7071   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7072   match(Set dst (SubVS dst (LoadVector mem)));
7073   effect(TEMP src);
7074   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7075   ins_encode %{
7076     int vector_len = 0;
7077     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7078   %}
7079   ins_pipe( pipe_slow );
7080 %}
7081 
7082 instruct vsub8S(vecX dst, vecX src) %{
7083   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7084   match(Set dst (SubVS dst src));
7085   format %{ "psubw   $dst,$src\t! sub packed8S" %}
7086   ins_encode %{
7087     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7088   %}
7089   ins_pipe( pipe_slow );
7090 %}
7091 
7092 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7093   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7094   match(Set dst (SubVS src1 src2));
7095   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7096   ins_encode %{
7097     int vector_len = 0;
7098     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7099   %}
7100   ins_pipe( pipe_slow );
7101 %}
7102 
7103 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7104   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7105   match(Set dst (SubVS src1 src2));
7106   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7107   ins_encode %{
7108     int vector_len = 0;
7109     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7110   %}
7111   ins_pipe( pipe_slow );
7112 %}
7113 
7114 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7115   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7116   match(Set dst (SubVS dst src2));
7117   effect(TEMP src1);
7118   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7119   ins_encode %{
7120     int vector_len = 0;
7121     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7122   %}
7123   ins_pipe( pipe_slow );
7124 %}
7125 
7126 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
7127   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7128   match(Set dst (SubVS src (LoadVector mem)));
7129   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7130   ins_encode %{
7131     int vector_len = 0;
7132     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7133   %}
7134   ins_pipe( pipe_slow );
7135 %}
7136 
7137 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
7138   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7139   match(Set dst (SubVS src (LoadVector mem)));
7140   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7141   ins_encode %{
7142     int vector_len = 0;
7143     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7144   %}
7145   ins_pipe( pipe_slow );
7146 %}
7147 
7148 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7149   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7150   match(Set dst (SubVS dst (LoadVector mem)));
7151   effect(TEMP src);
7152   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7153   ins_encode %{
7154     int vector_len = 0;
7155     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7156   %}
7157   ins_pipe( pipe_slow );
7158 %}
7159 
7160 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7161   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7162   match(Set dst (SubVS src1 src2));
7163   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7164   ins_encode %{
7165     int vector_len = 1;
7166     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7167   %}
7168   ins_pipe( pipe_slow );
7169 %}
7170 
7171 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7172   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7173   match(Set dst (SubVS src1 src2));
7174   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7175   ins_encode %{
7176     int vector_len = 1;
7177     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7178   %}
7179   ins_pipe( pipe_slow );
7180 %}
7181 
7182 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7183   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7184   match(Set dst (SubVS dst src2));
7185   effect(TEMP src1);
7186   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7187   ins_encode %{
7188     int vector_len = 1;
7189     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7190   %}
7191   ins_pipe( pipe_slow );
7192 %}
7193 
7194 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7195   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7196   match(Set dst (SubVS src (LoadVector mem)));
7197   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7198   ins_encode %{
7199     int vector_len = 1;
7200     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7201   %}
7202   ins_pipe( pipe_slow );
7203 %}
7204 
7205 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7206   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7207   match(Set dst (SubVS src (LoadVector mem)));
7208   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7209   ins_encode %{
7210     int vector_len = 1;
7211     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7212   %}
7213   ins_pipe( pipe_slow );
7214 %}
7215 
7216 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7217   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7218   match(Set dst (SubVS dst (LoadVector mem)));
7219   effect(TEMP src);
7220   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7221   ins_encode %{
7222     int vector_len = 1;
7223     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7224   %}
7225   ins_pipe( pipe_slow );
7226 %}
7227 
7228 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7229   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7230   match(Set dst (SubVS src1 src2));
7231   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7232   ins_encode %{
7233     int vector_len = 2;
7234     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7235   %}
7236   ins_pipe( pipe_slow );
7237 %}
7238 
7239 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7240   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7241   match(Set dst (SubVS src (LoadVector mem)));
7242   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7243   ins_encode %{
7244     int vector_len = 2;
7245     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7246   %}
7247   ins_pipe( pipe_slow );
7248 %}
7249 
7250 // Integers vector sub
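// Element types of dword size and up (int, long, float, double) need no
// BW split: their packed forms are part of the AVX-512 foundation set, so a
// plain UseAVX level check is enough.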
7251 instruct vsub2I(vecD dst, vecD src) %{
7252   predicate(n->as_Vector()->length() == 2);
7253   match(Set dst (SubVI dst src));
7254   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7255   ins_encode %{
7256     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7257   %}
7258   ins_pipe( pipe_slow );
7259 %}
7260 
7261 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7262   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7263   match(Set dst (SubVI src1 src2));
7264   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7265   ins_encode %{
7266     int vector_len = 0;
7267     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7268   %}
7269   ins_pipe( pipe_slow );
7270 %}
7271 
7272 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7273   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7274   match(Set dst (SubVI src (LoadVector mem)));
7275   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7276   ins_encode %{
7277     int vector_len = 0;
7278     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7279   %}
7280   ins_pipe( pipe_slow );
7281 %}
7282 
7283 instruct vsub4I(vecX dst, vecX src) %{
7284   predicate(n->as_Vector()->length() == 4);
7285   match(Set dst (SubVI dst src));
7286   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7287   ins_encode %{
7288     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7289   %}
7290   ins_pipe( pipe_slow );
7291 %}
7292 
7293 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7294   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7295   match(Set dst (SubVI src1 src2));
7296   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7297   ins_encode %{
7298     int vector_len = 0;
7299     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7300   %}
7301   ins_pipe( pipe_slow );
7302 %}
7303 
7304 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7305   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7306   match(Set dst (SubVI src (LoadVector mem)));
7307   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7308   ins_encode %{
7309     int vector_len = 0;
7310     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7311   %}
7312   ins_pipe( pipe_slow );
7313 %}
7314 
7315 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7316   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7317   match(Set dst (SubVI src1 src2));
7318   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7319   ins_encode %{
7320     int vector_len = 1;
7321     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7322   %}
7323   ins_pipe( pipe_slow );
7324 %}
7325 
7326 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7327   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7328   match(Set dst (SubVI src (LoadVector mem)));
7329   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7330   ins_encode %{
7331     int vector_len = 1;
7332     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7333   %}
7334   ins_pipe( pipe_slow );
7335 %}
7336 
7337 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7338   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7339   match(Set dst (SubVI src1 src2));
7340   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7341   ins_encode %{
7342     int vector_len = 2;
7343     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7344   %}
7345   ins_pipe( pipe_slow );
7346 %}
7347 
7348 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7349   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7350   match(Set dst (SubVI src (LoadVector mem)));
7351   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7352   ins_encode %{
7353     int vector_len = 2;
7354     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7355   %}
7356   ins_pipe( pipe_slow );
7357 %}
7358 
7359 // Longs vector sub
7360 instruct vsub2L(vecX dst, vecX src) %{
7361   predicate(n->as_Vector()->length() == 2);
7362   match(Set dst (SubVL dst src));
7363   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7364   ins_encode %{
7365     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7366   %}
7367   ins_pipe( pipe_slow );
7368 %}
7369 
7370 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7371   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7372   match(Set dst (SubVL src1 src2));
7373   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7374   ins_encode %{
7375     int vector_len = 0;
7376     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7377   %}
7378   ins_pipe( pipe_slow );
7379 %}
7380 
7381 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7382   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7383   match(Set dst (SubVL src (LoadVector mem)));
7384   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7385   ins_encode %{
7386     int vector_len = 0;
7387     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7388   %}
7389   ins_pipe( pipe_slow );
7390 %}
7391 
7392 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7393   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7394   match(Set dst (SubVL src1 src2));
7395   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7396   ins_encode %{
7397     int vector_len = 1;
7398     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7399   %}
7400   ins_pipe( pipe_slow );
7401 %}
7402 
7403 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7404   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7405   match(Set dst (SubVL src (LoadVector mem)));
7406   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7407   ins_encode %{
7408     int vector_len = 1;
7409     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7410   %}
7411   ins_pipe( pipe_slow );
7412 %}
7413 
7414 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7415   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7416   match(Set dst (SubVL src1 src2));
7417   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7418   ins_encode %{
7419     int vector_len = 2;
7420     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7421   %}
7422   ins_pipe( pipe_slow );
7423 %}
7424 
7425 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7426   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7427   match(Set dst (SubVL src (LoadVector mem)));
7428   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7429   ins_encode %{
7430     int vector_len = 2;
7431     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7432   %}
7433   ins_pipe( pipe_slow );
7434 %}
7435 
7436 // Floats vector sub
7437 instruct vsub2F(vecD dst, vecD src) %{
7438   predicate(n->as_Vector()->length() == 2);
7439   match(Set dst (SubVF dst src));
7440   format %{ "subps   $dst,$src\t! sub packed2F" %}
7441   ins_encode %{
7442     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7443   %}
7444   ins_pipe( pipe_slow );
7445 %}
7446 
7447 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7448   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7449   match(Set dst (SubVF src1 src2));
7450   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7451   ins_encode %{
7452     int vector_len = 0;
7453     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7454   %}
7455   ins_pipe( pipe_slow );
7456 %}
7457 
7458 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7459   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7460   match(Set dst (SubVF src (LoadVector mem)));
7461   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7462   ins_encode %{
7463     int vector_len = 0;
7464     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7465   %}
7466   ins_pipe( pipe_slow );
7467 %}
7468 
7469 instruct vsub4F(vecX dst, vecX src) %{
7470   predicate(n->as_Vector()->length() == 4);
7471   match(Set dst (SubVF dst src));
7472   format %{ "subps   $dst,$src\t! sub packed4F" %}
7473   ins_encode %{
7474     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7475   %}
7476   ins_pipe( pipe_slow );
7477 %}
7478 
7479 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7480   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7481   match(Set dst (SubVF src1 src2));
7482   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7483   ins_encode %{
7484     int vector_len = 0;
7485     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7486   %}
7487   ins_pipe( pipe_slow );
7488 %}
7489 
7490 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7491   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7492   match(Set dst (SubVF src (LoadVector mem)));
7493   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7494   ins_encode %{
7495     int vector_len = 0;
7496     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7497   %}
7498   ins_pipe( pipe_slow );
7499 %}
7500 
7501 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7502   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7503   match(Set dst (SubVF src1 src2));
7504   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7505   ins_encode %{
7506     int vector_len = 1;
7507     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7508   %}
7509   ins_pipe( pipe_slow );
7510 %}
7511 
7512 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7513   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7514   match(Set dst (SubVF src (LoadVector mem)));
7515   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7516   ins_encode %{
7517     int vector_len = 1;
7518     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7519   %}
7520   ins_pipe( pipe_slow );
7521 %}
7522 
7523 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7524   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7525   match(Set dst (SubVF src1 src2));
7526   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7527   ins_encode %{
7528     int vector_len = 2;
7529     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7530   %}
7531   ins_pipe( pipe_slow );
7532 %}
7533 
7534 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7535   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7536   match(Set dst (SubVF src (LoadVector mem)));
7537   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7538   ins_encode %{
7539     int vector_len = 2;
7540     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7541   %}
7542   ins_pipe( pipe_slow );
7543 %}
7544 
7545 // Doubles vector sub
7546 instruct vsub2D(vecX dst, vecX src) %{
7547   predicate(n->as_Vector()->length() == 2);
7548   match(Set dst (SubVD dst src));
7549   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7550   ins_encode %{
7551     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7552   %}
7553   ins_pipe( pipe_slow );
7554 %}
7555 
7556 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7557   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7558   match(Set dst (SubVD src1 src2));
7559   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7560   ins_encode %{
7561     int vector_len = 0;
7562     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7563   %}
7564   ins_pipe( pipe_slow );
7565 %}
7566 
7567 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7568   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7569   match(Set dst (SubVD src (LoadVector mem)));
7570   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7571   ins_encode %{
7572     int vector_len = 0;
7573     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7574   %}
7575   ins_pipe( pipe_slow );
7576 %}
7577 
7578 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7579   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7580   match(Set dst (SubVD src1 src2));
7581   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7582   ins_encode %{
7583     int vector_len = 1;
7584     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7585   %}
7586   ins_pipe( pipe_slow );
7587 %}
7588 
7589 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7590   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7591   match(Set dst (SubVD src (LoadVector mem)));
7592   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7593   ins_encode %{
7594     int vector_len = 1;
7595     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7596   %}
7597   ins_pipe( pipe_slow );
7598 %}
7599 
7600 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7601   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7602   match(Set dst (SubVD src1 src2));
7603   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7604   ins_encode %{
7605     int vector_len = 2;
7606     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7607   %}
7608   ins_pipe( pipe_slow );
7609 %}
7610 
7611 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7612   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7613   match(Set dst (SubVD src (LoadVector mem)));
7614   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7615   ins_encode %{
7616     int vector_len = 2;
7617     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7618   %}
7619   ins_pipe( pipe_slow );
7620 %}
7621 
7622 // --------------------------------- MUL --------------------------------------
7623 
7624 // Shorts/Chars vector mul
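// pmullw/vpmullw multiply packed 16-bit elements and keep the low 16 bits of
// each product, which is what the truncating short/char multiply needs.  The
// avx/evex/evex_special split mirrors the byte/short subtraction rules above.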
7625 instruct vmul2S(vecS dst, vecS src) %{
7626   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7627   match(Set dst (MulVS dst src));
7628   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
7629   ins_encode %{
7630     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7631   %}
7632   ins_pipe( pipe_slow );
7633 %}
7634 
7635 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7636   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7637   match(Set dst (MulVS src1 src2));
7638   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7639   ins_encode %{
7640     int vector_len = 0;
7641     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7642   %}
7643   ins_pipe( pipe_slow );
7644 %}
7645 
7646 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7647   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7648   match(Set dst (MulVS src1 src2));
7649   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7650   ins_encode %{
7651     int vector_len = 0;
7652     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7653   %}
7654   ins_pipe( pipe_slow );
7655 %}
7656 
7657 instruct vmul2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
7658   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7659   match(Set dst (MulVS dst src2));
7660   effect(TEMP src1);
7661   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7662   ins_encode %{
7663     int vector_len = 0;
7664     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7665   %}
7666   ins_pipe( pipe_slow );
7667 %}
7668 
7669 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7670   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7671   match(Set dst (MulVS src (LoadVector mem)));
7672   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7673   ins_encode %{
7674     int vector_len = 0;
7675     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7676   %}
7677   ins_pipe( pipe_slow );
7678 %}
7679 
7680 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7681   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7682   match(Set dst (MulVS src (LoadVector mem)));
7683   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7684   ins_encode %{
7685     int vector_len = 0;
7686     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7687   %}
7688   ins_pipe( pipe_slow );
7689 %}
7690 
7691 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7692   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7693   match(Set dst (MulVS dst (LoadVector mem)));
7694   effect(TEMP src);
7695   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7696   ins_encode %{
7697     int vector_len = 0;
7698     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7699   %}
7700   ins_pipe( pipe_slow );
7701 %}
7702 
7703 instruct vmul4S(vecD dst, vecD src) %{
7704   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7705   match(Set dst (MulVS dst src));
7706   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7707   ins_encode %{
7708     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7709   %}
7710   ins_pipe( pipe_slow );
7711 %}
7712 
7713 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7714   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7715   match(Set dst (MulVS src1 src2));
7716   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7717   ins_encode %{
7718     int vector_len = 0;
7719     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7720   %}
7721   ins_pipe( pipe_slow );
7722 %}
7723 
7724 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7725   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7726   match(Set dst (MulVS src1 src2));
7727   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7728   ins_encode %{
7729     int vector_len = 0;
7730     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7731   %}
7732   ins_pipe( pipe_slow );
7733 %}
7734 
7735 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7736   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7737   match(Set dst (MulVS dst src2));
7738   effect(TEMP src1);
7739   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7740   ins_encode %{
7741     int vector_len = 0;
7742     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7743   %}
7744   ins_pipe( pipe_slow );
7745 %}
7746 
7747 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7748   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7749   match(Set dst (MulVS src (LoadVector mem)));
7750   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7751   ins_encode %{
7752     int vector_len = 0;
7753     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7754   %}
7755   ins_pipe( pipe_slow );
7756 %}
7757 
7758 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7759   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7760   match(Set dst (MulVS src (LoadVector mem)));
7761   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7762   ins_encode %{
7763     int vector_len = 0;
7764     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7765   %}
7766   ins_pipe( pipe_slow );
7767 %}
7768 
7769 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7770   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7771   match(Set dst (MulVS dst (LoadVector mem)));
7772   effect(TEMP src);
7773   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7774   ins_encode %{
7775     int vector_len = 0;
7776     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7777   %}
7778   ins_pipe( pipe_slow );
7779 %}
7780 
7781 instruct vmul8S(vecX dst, vecX src) %{
7782   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7783   match(Set dst (MulVS dst src));
7784   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7785   ins_encode %{
7786     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7787   %}
7788   ins_pipe( pipe_slow );
7789 %}
7790 
7791 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7792   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7793   match(Set dst (MulVS src1 src2));
7794   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7795   ins_encode %{
7796     int vector_len = 0;
7797     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7798   %}
7799   ins_pipe( pipe_slow );
7800 %}
7801 
7802 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7803   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7804   match(Set dst (MulVS src1 src2));
7805   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7806   ins_encode %{
7807     int vector_len = 0;
7808     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7809   %}
7810   ins_pipe( pipe_slow );
7811 %}
7812 
7813 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7814   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7815   match(Set dst (MulVS dst src2));
7816   effect(TEMP src1);
7817   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7818   ins_encode %{
7819     int vector_len = 0;
7820     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7821   %}
7822   ins_pipe( pipe_slow );
7823 %}
7824 
7825 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7826   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7827   match(Set dst (MulVS src (LoadVector mem)));
7828   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7829   ins_encode %{
7830     int vector_len = 0;
7831     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7832   %}
7833   ins_pipe( pipe_slow );
7834 %}
7835 
7836 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7837   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7838   match(Set dst (MulVS src (LoadVector mem)));
7839   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7840   ins_encode %{
7841     int vector_len = 0;
7842     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7843   %}
7844   ins_pipe( pipe_slow );
7845 %}
7846 
7847 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7848   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7849   match(Set dst (MulVS dst (LoadVector mem)));
7850   effect(TEMP src);
7851   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7852   ins_encode %{
7853     int vector_len = 0;
7854     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7855   %}
7856   ins_pipe( pipe_slow );
7857 %}
7858 
7859 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7860   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7861   match(Set dst (MulVS src1 src2));
7862   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7863   ins_encode %{
7864     int vector_len = 1;
7865     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7866   %}
7867   ins_pipe( pipe_slow );
7868 %}
7869 
7870 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7871   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7872   match(Set dst (MulVS src1 src2));
7873   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7874   ins_encode %{
7875     int vector_len = 1;
7876     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7877   %}
7878   ins_pipe( pipe_slow );
7879 %}
7880 
7881 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7882   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7883   match(Set dst (MulVS dst src2));
7884   effect(TEMP src1);
7885   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7886   ins_encode %{
7887     int vector_len = 1;
7888     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7889   %}
7890   ins_pipe( pipe_slow );
7891 %}
7892 
7893 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
7894   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7895   match(Set dst (MulVS src (LoadVector mem)));
7896   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7897   ins_encode %{
7898     int vector_len = 1;
7899     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7900   %}
7901   ins_pipe( pipe_slow );
7902 %}
7903 
7904 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
7905   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7906   match(Set dst (MulVS src (LoadVector mem)));
7907   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7908   ins_encode %{
7909     int vector_len = 1;
7910     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7911   %}
7912   ins_pipe( pipe_slow );
7913 %}
7914 
7915 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7916   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7917   match(Set dst (MulVS dst (LoadVector mem)));
7918   effect(TEMP src);
7919   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7920   ins_encode %{
7921     int vector_len = 1;
7922     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7923   %}
7924   ins_pipe( pipe_slow );
7925 %}
7926 
7927 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7928   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7929   match(Set dst (MulVS src1 src2));
7930   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7931   ins_encode %{
7932     int vector_len = 2;
7933     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7934   %}
7935   ins_pipe( pipe_slow );
7936 %}
7937 
7938 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7939   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7940   match(Set dst (MulVS src (LoadVector mem)));
7941   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7942   ins_encode %{
7943     int vector_len = 2;
7944     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7945   %}
7946   ins_pipe( pipe_slow );
7947 %}
7948 
7949 // Integers vector mul (sse4_1)
7950 instruct vmul2I(vecD dst, vecD src) %{
7951   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7952   match(Set dst (MulVI dst src));
7953   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7954   ins_encode %{
7955     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7956   %}
7957   ins_pipe( pipe_slow );
7958 %}
7959 
7960 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7961   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7962   match(Set dst (MulVI src1 src2));
7963   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7964   ins_encode %{
7965     int vector_len = 0;
7966     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7967   %}
7968   ins_pipe( pipe_slow );
7969 %}
7970 
7971 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7972   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7973   match(Set dst (MulVI src (LoadVector mem)));
7974   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7975   ins_encode %{
7976     int vector_len = 0;
7977     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7978   %}
7979   ins_pipe( pipe_slow );
7980 %}
7981 
7982 instruct vmul4I(vecX dst, vecX src) %{
7983   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7984   match(Set dst (MulVI dst src));
7985   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7986   ins_encode %{
7987     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7988   %}
7989   ins_pipe( pipe_slow );
7990 %}
7991 
7992 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7993   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7994   match(Set dst (MulVI src1 src2));
7995   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7996   ins_encode %{
7997     int vector_len = 0;
7998     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7999   %}
8000   ins_pipe( pipe_slow );
8001 %}
8002 
8003 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
8004   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8005   match(Set dst (MulVI src (LoadVector mem)));
8006   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
8007   ins_encode %{
8008     int vector_len = 0;
8009     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8010   %}
8011   ins_pipe( pipe_slow );
8012 %}
8013 
8014 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
8015   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
8016   match(Set dst (MulVL src1 src2));
8017   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
8018   ins_encode %{
8019     int vector_len = 0;
8020     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8021   %}
8022   ins_pipe( pipe_slow );
8023 %}
8024 
8025 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
8026   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
8027   match(Set dst (MulVL src (LoadVector mem)));
8028   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
8029   ins_encode %{
8030     int vector_len = 0;
8031     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8032   %}
8033   ins_pipe( pipe_slow );
8034 %}
8035 
8036 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
8037   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
8038   match(Set dst (MulVL src1 src2));
8039   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
8040   ins_encode %{
8041     int vector_len = 1;
8042     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8043   %}
8044   ins_pipe( pipe_slow );
8045 %}
8046 
8047 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
8048   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
8049   match(Set dst (MulVL src (LoadVector mem)));
8050   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
8051   ins_encode %{
8052     int vector_len = 1;
8053     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8054   %}
8055   ins_pipe( pipe_slow );
8056 %}
8057 
8058 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
8059   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8060   match(Set dst (MulVL src1 src2));
8061   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
8062   ins_encode %{
8063     int vector_len = 2;
8064     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8065   %}
8066   ins_pipe( pipe_slow );
8067 %}
8068 
8069 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
8070   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8071   match(Set dst (MulVL src (LoadVector mem)));
8072   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
8073   ins_encode %{
8074     int vector_len = 2;
8075     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8076   %}
8077   ins_pipe( pipe_slow );
8078 %}
8079 
8080 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
8081   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8082   match(Set dst (MulVI src1 src2));
8083   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
8084   ins_encode %{
8085     int vector_len = 1;
8086     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8087   %}
8088   ins_pipe( pipe_slow );
8089 %}
8090 
8091 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
8092   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8093   match(Set dst (MulVI src (LoadVector mem)));
8094   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
8095   ins_encode %{
8096     int vector_len = 1;
8097     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8098   %}
8099   ins_pipe( pipe_slow );
8100 %}
8101 
8102 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
8103   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8104   match(Set dst (MulVI src1 src2));
8105   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
8106   ins_encode %{
8107     int vector_len = 2;
8108     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8109   %}
8110   ins_pipe( pipe_slow );
8111 %}
8112 
8113 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
8114   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8115   match(Set dst (MulVI src (LoadVector mem)));
8116   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
8117   ins_encode %{
8118     int vector_len = 2;
8119     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8120   %}
8121   ins_pipe( pipe_slow );
8122 %}
8123 
8124 // Floats vector mul
8125 instruct vmul2F(vecD dst, vecD src) %{
8126   predicate(n->as_Vector()->length() == 2);
8127   match(Set dst (MulVF dst src));
8128   format %{ "mulps   $dst,$src\t! mul packed2F" %}
8129   ins_encode %{
8130     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8131   %}
8132   ins_pipe( pipe_slow );
8133 %}
8134 
8135 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
8136   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8137   match(Set dst (MulVF src1 src2));
8138   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
8139   ins_encode %{
8140     int vector_len = 0;
8141     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8142   %}
8143   ins_pipe( pipe_slow );
8144 %}
8145 
8146 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
8147   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8148   match(Set dst (MulVF src (LoadVector mem)));
8149   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
8150   ins_encode %{
8151     int vector_len = 0;
8152     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8153   %}
8154   ins_pipe( pipe_slow );
8155 %}
8156 
8157 instruct vmul4F(vecX dst, vecX src) %{
8158   predicate(n->as_Vector()->length() == 4);
8159   match(Set dst (MulVF dst src));
8160   format %{ "mulps   $dst,$src\t! mul packed4F" %}
8161   ins_encode %{
8162     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8163   %}
8164   ins_pipe( pipe_slow );
8165 %}
8166 
8167 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8168   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8169   match(Set dst (MulVF src1 src2));
8170   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8171   ins_encode %{
8172     int vector_len = 0;
8173     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8174   %}
8175   ins_pipe( pipe_slow );
8176 %}
8177 
8178 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8179   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8180   match(Set dst (MulVF src (LoadVector mem)));
8181   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8182   ins_encode %{
8183     int vector_len = 0;
8184     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8185   %}
8186   ins_pipe( pipe_slow );
8187 %}
8188 
8189 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8190   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8191   match(Set dst (MulVF src1 src2));
8192   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8193   ins_encode %{
8194     int vector_len = 1;
8195     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8196   %}
8197   ins_pipe( pipe_slow );
8198 %}
8199 
8200 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8201   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8202   match(Set dst (MulVF src (LoadVector mem)));
8203   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8204   ins_encode %{
8205     int vector_len = 1;
8206     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8207   %}
8208   ins_pipe( pipe_slow );
8209 %}
8210 
8211 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8212   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8213   match(Set dst (MulVF src1 src2));
8214   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8215   ins_encode %{
8216     int vector_len = 2;
8217     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8218   %}
8219   ins_pipe( pipe_slow );
8220 %}
8221 
8222 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8223   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8224   match(Set dst (MulVF src (LoadVector mem)));
8225   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8226   ins_encode %{
8227     int vector_len = 2;
8228     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8229   %}
8230   ins_pipe( pipe_slow );
8231 %}
8232 
8233 // Doubles vector mul
8234 instruct vmul2D(vecX dst, vecX src) %{
8235   predicate(n->as_Vector()->length() == 2);
8236   match(Set dst (MulVD dst src));
8237   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8238   ins_encode %{
8239     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8240   %}
8241   ins_pipe( pipe_slow );
8242 %}
8243 
8244 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8245   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8246   match(Set dst (MulVD src1 src2));
8247   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8248   ins_encode %{
8249     int vector_len = 0;
8250     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8251   %}
8252   ins_pipe( pipe_slow );
8253 %}
8254 
8255 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8256   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8257   match(Set dst (MulVD src (LoadVector mem)));
8258   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8259   ins_encode %{
8260     int vector_len = 0;
8261     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8262   %}
8263   ins_pipe( pipe_slow );
8264 %}
8265 
8266 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8267   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8268   match(Set dst (MulVD src1 src2));
8269   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8270   ins_encode %{
8271     int vector_len = 1;
8272     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8273   %}
8274   ins_pipe( pipe_slow );
8275 %}
8276 
8277 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8278   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8279   match(Set dst (MulVD src (LoadVector mem)));
8280   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8281   ins_encode %{
8282     int vector_len = 1;
8283     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8284   %}
8285   ins_pipe( pipe_slow );
8286 %}
8287 
8288 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8289   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8290   match(Set dst (MulVD src1 src2));
8291   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
8292   ins_encode %{
8293     int vector_len = 2;
8294     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8295   %}
8296   ins_pipe( pipe_slow );
8297 %}
8298 
8299 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8300   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8301   match(Set dst (MulVD src (LoadVector mem)));
8302   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
8303   ins_encode %{
8304     int vector_len = 2;
8305     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8306   %}
8307   ins_pipe( pipe_slow );
8308 %}
8309 
8310 instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8311   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
8312   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
8313   effect(TEMP dst, USE src1, USE src2);
8314   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
8315             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
8316          %}
8317   ins_encode %{
8318     int vector_len = 1;
8319     int cond = (Assembler::Condition)($copnd$$cmpcode);
8320     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8321     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8322   %}
8323   ins_pipe( pipe_slow );
8324 %}
8325 
8326 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8327   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
8328   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
8329   effect(TEMP dst, USE src1, USE src2);
8330   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
8331             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
8332          %}
8333   ins_encode %{
8334     int vector_len = 1;
8335     int cond = (Assembler::Condition)($copnd$$cmpcode);
8336     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8337     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8338   %}
8339   ins_pipe( pipe_slow );
8340 %}
8341 
8342 // --------------------------------- DIV --------------------------------------
8343 
8344 // Floats vector div
8345 instruct vdiv2F(vecD dst, vecD src) %{
8346   predicate(n->as_Vector()->length() == 2);
8347   match(Set dst (DivVF dst src));
8348   format %{ "divps   $dst,$src\t! div packed2F" %}
8349   ins_encode %{
8350     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8351   %}
8352   ins_pipe( pipe_slow );
8353 %}
8354 
8355 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
8356   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8357   match(Set dst (DivVF src1 src2));
8358   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8359   ins_encode %{
8360     int vector_len = 0;
8361     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8362   %}
8363   ins_pipe( pipe_slow );
8364 %}
8365 
8366 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8367   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8368   match(Set dst (DivVF src (LoadVector mem)));
8369   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8370   ins_encode %{
8371     int vector_len = 0;
8372     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8373   %}
8374   ins_pipe( pipe_slow );
8375 %}
8376 
8377 instruct vdiv4F(vecX dst, vecX src) %{
8378   predicate(n->as_Vector()->length() == 4);
8379   match(Set dst (DivVF dst src));
8380   format %{ "divps   $dst,$src\t! div packed4F" %}
8381   ins_encode %{
8382     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8383   %}
8384   ins_pipe( pipe_slow );
8385 %}
8386 
8387 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8388   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8389   match(Set dst (DivVF src1 src2));
8390   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8391   ins_encode %{
8392     int vector_len = 0;
8393     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8394   %}
8395   ins_pipe( pipe_slow );
8396 %}
8397 
8398 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8399   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8400   match(Set dst (DivVF src (LoadVector mem)));
8401   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8402   ins_encode %{
8403     int vector_len = 0;
8404     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8405   %}
8406   ins_pipe( pipe_slow );
8407 %}
8408 
8409 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8410   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8411   match(Set dst (DivVF src1 src2));
8412   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8413   ins_encode %{
8414     int vector_len = 1;
8415     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8416   %}
8417   ins_pipe( pipe_slow );
8418 %}
8419 
8420 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8421   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8422   match(Set dst (DivVF src (LoadVector mem)));
8423   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8424   ins_encode %{
8425     int vector_len = 1;
8426     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8427   %}
8428   ins_pipe( pipe_slow );
8429 %}
8430 
8431 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8432   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
8433   match(Set dst (DivVF src1 src2));
8434   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8435   ins_encode %{
8436     int vector_len = 2;
8437     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8438   %}
8439   ins_pipe( pipe_slow );
8440 %}
8441 
8442 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8443   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
8444   match(Set dst (DivVF src (LoadVector mem)));
8445   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8446   ins_encode %{
8447     int vector_len = 2;
8448     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8449   %}
8450   ins_pipe( pipe_slow );
8451 %}
8452 
8453 // Doubles vector div
8454 instruct vdiv2D(vecX dst, vecX src) %{
8455   predicate(n->as_Vector()->length() == 2);
8456   match(Set dst (DivVD dst src));
8457   format %{ "divpd   $dst,$src\t! div packed2D" %}
8458   ins_encode %{
8459     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8460   %}
8461   ins_pipe( pipe_slow );
8462 %}
8463 
8464 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8465   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8466   match(Set dst (DivVD src1 src2));
8467   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8468   ins_encode %{
8469     int vector_len = 0;
8470     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8471   %}
8472   ins_pipe( pipe_slow );
8473 %}
8474 
8475 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8476   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8477   match(Set dst (DivVD src (LoadVector mem)));
8478   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8479   ins_encode %{
8480     int vector_len = 0;
8481     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8482   %}
8483   ins_pipe( pipe_slow );
8484 %}
8485 
8486 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8487   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8488   match(Set dst (DivVD src1 src2));
8489   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8490   ins_encode %{
8491     int vector_len = 1;
8492     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8493   %}
8494   ins_pipe( pipe_slow );
8495 %}
8496 
8497 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8498   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8499   match(Set dst (DivVD src (LoadVector mem)));
8500   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8501   ins_encode %{
8502     int vector_len = 1;
8503     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8504   %}
8505   ins_pipe( pipe_slow );
8506 %}
8507 
8508 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8509   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8510   match(Set dst (DivVD src1 src2));
8511   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8512   ins_encode %{
8513     int vector_len = 2;
8514     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8515   %}
8516   ins_pipe( pipe_slow );
8517 %}
8518 
8519 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8520   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8521   match(Set dst (DivVD src (LoadVector mem)));
8522   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8523   ins_encode %{
8524     int vector_len = 2;
8525     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8526   %}
8527   ins_pipe( pipe_slow );
8528 %}
8529 
8530 // ------------------------------ Shift ---------------------------------------
8531 
// Left and right shift count vectors are the same on x86
// (only the low bits of the xmm register are used for the count).
// A hedged Java sketch of a loop using such a count follows the rule below.
8534 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8535   match(Set dst (LShiftCntV cnt));
8536   match(Set dst (RShiftCntV cnt));
8537   format %{ "movd    $dst,$cnt\t! load shift count" %}
8538   ins_encode %{
8539     __ movdl($dst$$XMMRegister, $cnt$$Register);
8540   %}
8541   ins_pipe( pipe_slow );
8542 %}
8543 
8544 // --------------------------------- Sqrt --------------------------------------
8545 
8546 // Floating point vector sqrt
8547 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8548   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8549   match(Set dst (SqrtVD src));
8550   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8551   ins_encode %{
8552     int vector_len = 0;
8553     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8554   %}
8555   ins_pipe( pipe_slow );
8556 %}
8557 
8558 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8559   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8560   match(Set dst (SqrtVD (LoadVector mem)));
8561   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8562   ins_encode %{
8563     int vector_len = 0;
8564     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8565   %}
8566   ins_pipe( pipe_slow );
8567 %}
8568 
8569 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8570   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8571   match(Set dst (SqrtVD src));
8572   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8573   ins_encode %{
8574     int vector_len = 1;
8575     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8576   %}
8577   ins_pipe( pipe_slow );
8578 %}
8579 
8580 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8581   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8582   match(Set dst (SqrtVD (LoadVector mem)));
8583   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8584   ins_encode %{
8585     int vector_len = 1;
8586     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8587   %}
8588   ins_pipe( pipe_slow );
8589 %}
8590 
8591 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8592   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8593   match(Set dst (SqrtVD src));
8594   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8595   ins_encode %{
8596     int vector_len = 2;
8597     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8598   %}
8599   ins_pipe( pipe_slow );
8600 %}
8601 
8602 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8603   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8604   match(Set dst (SqrtVD (LoadVector mem)));
8605   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8606   ins_encode %{
8607     int vector_len = 2;
8608     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8609   %}
8610   ins_pipe( pipe_slow );
8611 %}
8612 
8613 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8614   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8615   match(Set dst (SqrtVF src));
8616   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8617   ins_encode %{
8618     int vector_len = 0;
8619     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8620   %}
8621   ins_pipe( pipe_slow );
8622 %}
8623 
8624 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8625   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8626   match(Set dst (SqrtVF (LoadVector mem)));
8627   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8628   ins_encode %{
8629     int vector_len = 0;
8630     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8631   %}
8632   ins_pipe( pipe_slow );
8633 %}
8634 
8635 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8636   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8637   match(Set dst (SqrtVF src));
8638   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8639   ins_encode %{
8640     int vector_len = 0;
8641     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8642   %}
8643   ins_pipe( pipe_slow );
8644 %}
8645 
8646 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8647   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8648   match(Set dst (SqrtVF (LoadVector mem)));
8649   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8650   ins_encode %{
8651     int vector_len = 0;
8652     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8653   %}
8654   ins_pipe( pipe_slow );
8655 %}
8656 
8657 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8658   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8659   match(Set dst (SqrtVF src));
8660   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8661   ins_encode %{
8662     int vector_len = 1;
8663     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8664   %}
8665   ins_pipe( pipe_slow );
8666 %}
8667 
8668 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8669   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8670   match(Set dst (SqrtVF (LoadVector mem)));
8671   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8672   ins_encode %{
8673     int vector_len = 1;
8674     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8675   %}
8676   ins_pipe( pipe_slow );
8677 %}
8678 
8679 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8680   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8681   match(Set dst (SqrtVF src));
8682   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8683   ins_encode %{
8684     int vector_len = 2;
8685     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8686   %}
8687   ins_pipe( pipe_slow );
8688 %}
8689 
8690 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8691   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8692   match(Set dst (SqrtVF (LoadVector mem)));
8693   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8694   ins_encode %{
8695     int vector_len = 2;
8696     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 // ------------------------------ LeftShift -----------------------------------
8702 
8703 // Shorts/Chars vector left shift
8704 instruct vsll2S(vecS dst, vecS shift) %{
8705   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8706   match(Set dst (LShiftVS dst shift));
8707   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8708   ins_encode %{
8709     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8710   %}
8711   ins_pipe( pipe_slow );
8712 %}
8713 
8714 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8715   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8716   match(Set dst (LShiftVS dst shift));
8717   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8718   ins_encode %{
8719     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8720   %}
8721   ins_pipe( pipe_slow );
8722 %}
8723 
8724 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8725   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8726   match(Set dst (LShiftVS src shift));
8727   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8728   ins_encode %{
8729     int vector_len = 0;
8730     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8731   %}
8732   ins_pipe( pipe_slow );
8733 %}
8734 
8735 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8736   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8737   match(Set dst (LShiftVS src shift));
8738   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8739   ins_encode %{
8740     int vector_len = 0;
8741     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8742   %}
8743   ins_pipe( pipe_slow );
8744 %}
8745 
8746 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8747   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8748   match(Set dst (LShiftVS dst shift));
8749   effect(TEMP src);
8750   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8751   ins_encode %{
8752     int vector_len = 0;
8753     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8754   %}
8755   ins_pipe( pipe_slow );
8756 %}
8757 
8758 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8759   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8760   match(Set dst (LShiftVS src shift));
8761   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8762   ins_encode %{
8763     int vector_len = 0;
8764     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8765   %}
8766   ins_pipe( pipe_slow );
8767 %}
8768 
8769 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8770   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8771   match(Set dst (LShiftVS src shift));
8772   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8773   ins_encode %{
8774     int vector_len = 0;
8775     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8776   %}
8777   ins_pipe( pipe_slow );
8778 %}
8779 
8780 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8781   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8782   match(Set dst (LShiftVS dst shift));
8783   effect(TEMP src);
8784   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8785   ins_encode %{
8786     int vector_len = 0;
8787     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8788   %}
8789   ins_pipe( pipe_slow );
8790 %}
8791 
8792 instruct vsll4S(vecD dst, vecS shift) %{
8793   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8794   match(Set dst (LShiftVS dst shift));
8795   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8796   ins_encode %{
8797     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8798   %}
8799   ins_pipe( pipe_slow );
8800 %}
8801 
8802 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8803   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8804   match(Set dst (LShiftVS dst shift));
8805   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8806   ins_encode %{
8807     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8808   %}
8809   ins_pipe( pipe_slow );
8810 %}
8811 
8812 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8813   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8814   match(Set dst (LShiftVS src shift));
8815   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8816   ins_encode %{
8817     int vector_len = 0;
8818     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8819   %}
8820   ins_pipe( pipe_slow );
8821 %}
8822 
8823 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8824   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8825   match(Set dst (LShiftVS src shift));
8826   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8827   ins_encode %{
8828     int vector_len = 0;
8829     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8830   %}
8831   ins_pipe( pipe_slow );
8832 %}
8833 
8834 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8835   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8836   match(Set dst (LShiftVS dst shift));
8837   effect(TEMP src);
8838   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8839   ins_encode %{
8840     int vector_len = 0;
8841     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8842   %}
8843   ins_pipe( pipe_slow );
8844 %}
8845 
8846 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8847   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8848   match(Set dst (LShiftVS src shift));
8849   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8850   ins_encode %{
8851     int vector_len = 0;
8852     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8853   %}
8854   ins_pipe( pipe_slow );
8855 %}
8856 
8857 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8858   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8859   match(Set dst (LShiftVS src shift));
8860   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8861   ins_encode %{
8862     int vector_len = 0;
8863     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8864   %}
8865   ins_pipe( pipe_slow );
8866 %}
8867 
8868 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8869   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8870   match(Set dst (LShiftVS dst shift));
8871   effect(TEMP src);
8872   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8873   ins_encode %{
8874     int vector_len = 0;
8875     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8876   %}
8877   ins_pipe( pipe_slow );
8878 %}
8879 
8880 instruct vsll8S(vecX dst, vecS shift) %{
8881   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8882   match(Set dst (LShiftVS dst shift));
8883   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8884   ins_encode %{
8885     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8886   %}
8887   ins_pipe( pipe_slow );
8888 %}
8889 
8890 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8891   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8892   match(Set dst (LShiftVS dst shift));
8893   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8894   ins_encode %{
8895     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8896   %}
8897   ins_pipe( pipe_slow );
8898 %}
8899 
8900 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8901   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8902   match(Set dst (LShiftVS src shift));
8903   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8904   ins_encode %{
8905     int vector_len = 0;
8906     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8907   %}
8908   ins_pipe( pipe_slow );
8909 %}
8910 
8911 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8912   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8913   match(Set dst (LShiftVS src shift));
8914   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8915   ins_encode %{
8916     int vector_len = 0;
8917     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8918   %}
8919   ins_pipe( pipe_slow );
8920 %}
8921 
8922 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8923   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8924   match(Set dst (LShiftVS dst shift));
8925   effect(TEMP src);
8926   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8927   ins_encode %{
8928     int vector_len = 0;
8929     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8930   %}
8931   ins_pipe( pipe_slow );
8932 %}
8933 
8934 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8935   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8936   match(Set dst (LShiftVS src shift));
8937   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8938   ins_encode %{
8939     int vector_len = 0;
8940     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8941   %}
8942   ins_pipe( pipe_slow );
8943 %}
8944 
8945 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8946   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8947   match(Set dst (LShiftVS src shift));
8948   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8949   ins_encode %{
8950     int vector_len = 0;
8951     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8952   %}
8953   ins_pipe( pipe_slow );
8954 %}
8955 
8956 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8957   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8958   match(Set dst (LShiftVS dst shift));
8959   effect(TEMP src);
8960   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8961   ins_encode %{
8962     int vector_len = 0;
8963     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8964   %}
8965   ins_pipe( pipe_slow );
8966 %}
8967 
8968 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
8969   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8970   match(Set dst (LShiftVS src shift));
8971   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8972   ins_encode %{
8973     int vector_len = 1;
8974     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8975   %}
8976   ins_pipe( pipe_slow );
8977 %}
8978 
8979 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
8980   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8981   match(Set dst (LShiftVS src shift));
8982   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8983   ins_encode %{
8984     int vector_len = 1;
8985     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8986   %}
8987   ins_pipe( pipe_slow );
8988 %}
8989 
8990 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
8991   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8992   match(Set dst (LShiftVS dst shift));
8993   effect(TEMP src);
8994   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8995   ins_encode %{
8996     int vector_len = 1;
8997     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8998   %}
8999   ins_pipe( pipe_slow );
9000 %}
9001 
9002 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9003   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9004   match(Set dst (LShiftVS src shift));
9005   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9006   ins_encode %{
9007     int vector_len = 1;
9008     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9009   %}
9010   ins_pipe( pipe_slow );
9011 %}
9012 
9013 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9014   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9015   match(Set dst (LShiftVS src shift));
9016   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9017   ins_encode %{
9018     int vector_len = 1;
9019     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9020   %}
9021   ins_pipe( pipe_slow );
9022 %}
9023 
9024 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9025   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9026   match(Set dst (LShiftVS dst shift));
9027   effect(TEMP src);
9028   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9029   ins_encode %{
9030     int vector_len = 1;
9031     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9032   %}
9033   ins_pipe( pipe_slow );
9034 %}
9035 
9036 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
9037   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9038   match(Set dst (LShiftVS src shift));
9039   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9040   ins_encode %{
9041     int vector_len = 2;
9042     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9043   %}
9044   ins_pipe( pipe_slow );
9045 %}
9046 
9047 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9048   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9049   match(Set dst (LShiftVS src shift));
9050   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9051   ins_encode %{
9052     int vector_len = 2;
9053     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9054   %}
9055   ins_pipe( pipe_slow );
9056 %}
9057 
9058 // Integers vector left shift
9059 instruct vsll2I(vecD dst, vecS shift) %{
9060   predicate(n->as_Vector()->length() == 2);
9061   match(Set dst (LShiftVI dst shift));
9062   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9063   ins_encode %{
9064     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9065   %}
9066   ins_pipe( pipe_slow );
9067 %}
9068 
9069 instruct vsll2I_imm(vecD dst, immI8 shift) %{
9070   predicate(n->as_Vector()->length() == 2);
9071   match(Set dst (LShiftVI dst shift));
9072   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9073   ins_encode %{
9074     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
9075   %}
9076   ins_pipe( pipe_slow );
9077 %}
9078 
9079 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
9080   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9081   match(Set dst (LShiftVI src shift));
9082   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
9083   ins_encode %{
9084     int vector_len = 0;
9085     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9086   %}
9087   ins_pipe( pipe_slow );
9088 %}
9089 
9090 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9091   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9092   match(Set dst (LShiftVI src shift));
9093   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
9094   ins_encode %{
9095     int vector_len = 0;
9096     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9097   %}
9098   ins_pipe( pipe_slow );
9099 %}
9100 
9101 instruct vsll4I(vecX dst, vecS shift) %{
9102   predicate(n->as_Vector()->length() == 4);
9103   match(Set dst (LShiftVI dst shift));
9104   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
9105   ins_encode %{
9106     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9107   %}
9108   ins_pipe( pipe_slow );
9109 %}
9110 
9111 instruct vsll4I_imm(vecX dst, immI8 shift) %{
9112   predicate(n->as_Vector()->length() == 4);
9113   match(Set dst (LShiftVI dst shift));
9114   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
9115   ins_encode %{
9116     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
9117   %}
9118   ins_pipe( pipe_slow );
9119 %}
9120 
9121 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
9122   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9123   match(Set dst (LShiftVI src shift));
9124   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9125   ins_encode %{
9126     int vector_len = 0;
9127     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9128   %}
9129   ins_pipe( pipe_slow );
9130 %}
9131 
9132 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9133   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9134   match(Set dst (LShiftVI src shift));
9135   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9136   ins_encode %{
9137     int vector_len = 0;
9138     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9139   %}
9140   ins_pipe( pipe_slow );
9141 %}
9142 
9143 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
9144   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9145   match(Set dst (LShiftVI src shift));
9146   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9147   ins_encode %{
9148     int vector_len = 1;
9149     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9150   %}
9151   ins_pipe( pipe_slow );
9152 %}
9153 
9154 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9155   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9156   match(Set dst (LShiftVI src shift));
9157   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9158   ins_encode %{
9159     int vector_len = 1;
9160     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9161   %}
9162   ins_pipe( pipe_slow );
9163 %}
9164 
9165 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
9166   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9167   match(Set dst (LShiftVI src shift));
9168   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9169   ins_encode %{
9170     int vector_len = 2;
9171     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9172   %}
9173   ins_pipe( pipe_slow );
9174 %}
9175 
9176 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9177   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9178   match(Set dst (LShiftVI src shift));
9179   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9180   ins_encode %{
9181     int vector_len = 2;
9182     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9183   %}
9184   ins_pipe( pipe_slow );
9185 %}
9186 
9187 // Longs vector left shift
9188 instruct vsll2L(vecX dst, vecS shift) %{
9189   predicate(n->as_Vector()->length() == 2);
9190   match(Set dst (LShiftVL dst shift));
9191   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9192   ins_encode %{
9193     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
9194   %}
9195   ins_pipe( pipe_slow );
9196 %}
9197 
9198 instruct vsll2L_imm(vecX dst, immI8 shift) %{
9199   predicate(n->as_Vector()->length() == 2);
9200   match(Set dst (LShiftVL dst shift));
9201   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9202   ins_encode %{
9203     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
9204   %}
9205   ins_pipe( pipe_slow );
9206 %}
9207 
9208 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
9209   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9210   match(Set dst (LShiftVL src shift));
9211   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9212   ins_encode %{
9213     int vector_len = 0;
9214     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9215   %}
9216   ins_pipe( pipe_slow );
9217 %}
9218 
9219 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9220   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9221   match(Set dst (LShiftVL src shift));
9222   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9223   ins_encode %{
9224     int vector_len = 0;
9225     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9226   %}
9227   ins_pipe( pipe_slow );
9228 %}
9229 
9230 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
9231   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9232   match(Set dst (LShiftVL src shift));
9233   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9234   ins_encode %{
9235     int vector_len = 1;
9236     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9237   %}
9238   ins_pipe( pipe_slow );
9239 %}
9240 
9241 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9242   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9243   match(Set dst (LShiftVL src shift));
9244   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9245   ins_encode %{
9246     int vector_len = 1;
9247     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9248   %}
9249   ins_pipe( pipe_slow );
9250 %}
9251 
9252 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9253   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9254   match(Set dst (LShiftVL src shift));
9255   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9256   ins_encode %{
9257     int vector_len = 2;
9258     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9259   %}
9260   ins_pipe( pipe_slow );
9261 %}
9262 
9263 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9264   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9265   match(Set dst (LShiftVL src shift));
9266   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9267   ins_encode %{
9268     int vector_len = 2;
9269     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9270   %}
9271   ins_pipe( pipe_slow );
9272 %}
9273 
9274 // ----------------------- LogicalRightShift -----------------------------------
9275 
// A logical right shift of a short vector would produce an incorrect Java
// result for negative data, because Java converts a short value to an int
// with sign extension before shifting. Char vectors are fine, since chars
// are unsigned values.
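//
// A small Java example of the distinction (plain Java semantics, shown only
// as an illustration; the helper method is hypothetical):
//
//   static void urshift3(short[] s, char[] c) {
//     for (int i = 0; i < s.length; i++) {
//       // s[i] is sign-extended to int before ">>> 3", so for negative
//       // values the narrowed result differs from a 16-bit psrlw by 3;
//       // this pattern therefore cannot use the packed-short rules below.
//       s[i] = (short)(s[i] >>> 3);
//     }
//     for (int i = 0; i < c.length; i++) {
//       // c[i] is zero-extended, so the narrowed result matches a 16-bit
//       // logical shift and the char form can be vectorized.
//       c[i] = (char)(c[i] >>> 3);
//     }
//   }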
9280 
9281 instruct vsrl2S(vecS dst, vecS shift) %{
9282   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9283   match(Set dst (URShiftVS dst shift));
9284   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9285   ins_encode %{
9286     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9287   %}
9288   ins_pipe( pipe_slow );
9289 %}
9290 
9291 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9292   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9293   match(Set dst (URShiftVS dst shift));
9294   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9295   ins_encode %{
9296     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9297   %}
9298   ins_pipe( pipe_slow );
9299 %}
9300 
9301 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9302   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9303   match(Set dst (URShiftVS src shift));
9304   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9305   ins_encode %{
9306     int vector_len = 0;
9307     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9313   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9314   match(Set dst (URShiftVS src shift));
9315   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9316   ins_encode %{
9317     int vector_len = 0;
9318     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9319   %}
9320   ins_pipe( pipe_slow );
9321 %}
9322 
9323 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9324   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9325   match(Set dst (URShiftVS dst shift));
9326   effect(TEMP src);
9327   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9328   ins_encode %{
9329     int vector_len = 0;
9330     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9331   %}
9332   ins_pipe( pipe_slow );
9333 %}
9334 
9335 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9336   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9337   match(Set dst (URShiftVS src shift));
9338   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9339   ins_encode %{
9340     int vector_len = 0;
9341     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9342   %}
9343   ins_pipe( pipe_slow );
9344 %}
9345 
9346 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9347   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9348   match(Set dst (URShiftVS src shift));
9349   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9350   ins_encode %{
9351     int vector_len = 0;
9352     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9353   %}
9354   ins_pipe( pipe_slow );
9355 %}
9356 
9357 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9358   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9359   match(Set dst (URShiftVS dst shift));
9360   effect(TEMP src);
9361   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9362   ins_encode %{
9363     int vector_len = 0;
9364     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9365   %}
9366   ins_pipe( pipe_slow );
9367 %}
9368 
9369 instruct vsrl4S(vecD dst, vecS shift) %{
9370   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9371   match(Set dst (URShiftVS dst shift));
9372   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9373   ins_encode %{
9374     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9375   %}
9376   ins_pipe( pipe_slow );
9377 %}
9378 
9379 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9380   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9381   match(Set dst (URShiftVS dst shift));
9382   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9383   ins_encode %{
9384     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9385   %}
9386   ins_pipe( pipe_slow );
9387 %}
9388 
9389 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9390   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9391   match(Set dst (URShiftVS src shift));
9392   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9393   ins_encode %{
9394     int vector_len = 0;
9395     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9396   %}
9397   ins_pipe( pipe_slow );
9398 %}
9399 
9400 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9401   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9402   match(Set dst (URShiftVS src shift));
9403   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9404   ins_encode %{
9405     int vector_len = 0;
9406     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9407   %}
9408   ins_pipe( pipe_slow );
9409 %}
9410 
9411 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9412   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9413   match(Set dst (URShiftVS dst shift));
9414   effect(TEMP src);
9415   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9416   ins_encode %{
9417     int vector_len = 0;
9418     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9419   %}
9420   ins_pipe( pipe_slow );
9421 %}
9422 
9423 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9424   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9425   match(Set dst (URShiftVS src shift));
9426   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9427   ins_encode %{
9428     int vector_len = 0;
9429     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9430   %}
9431   ins_pipe( pipe_slow );
9432 %}
9433 
9434 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9435   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9436   match(Set dst (URShiftVS src shift));
9437   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9438   ins_encode %{
9439     int vector_len = 0;
9440     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9441   %}
9442   ins_pipe( pipe_slow );
9443 %}
9444 
9445 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9446   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9447   match(Set dst (URShiftVS dst shift));
9448   effect(TEMP src);
9449   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9450   ins_encode %{
9451     int vector_len = 0;
9452     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9453   %}
9454   ins_pipe( pipe_slow );
9455 %}
9456 
9457 instruct vsrl8S(vecX dst, vecS shift) %{
9458   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9459   match(Set dst (URShiftVS dst shift));
9460   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9461   ins_encode %{
9462     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9463   %}
9464   ins_pipe( pipe_slow );
9465 %}
9466 
9467 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9468   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9469   match(Set dst (URShiftVS dst shift));
9470   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9471   ins_encode %{
9472     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9473   %}
9474   ins_pipe( pipe_slow );
9475 %}
9476 
9477 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9478   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9479   match(Set dst (URShiftVS src shift));
9480   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9481   ins_encode %{
9482     int vector_len = 0;
9483     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9484   %}
9485   ins_pipe( pipe_slow );
9486 %}
9487 
9488 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9489   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9490   match(Set dst (URShiftVS src shift));
9491   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9492   ins_encode %{
9493     int vector_len = 0;
9494     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9495   %}
9496   ins_pipe( pipe_slow );
9497 %}
9498 
9499 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9500   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9501   match(Set dst (URShiftVS dst shift));
9502   effect(TEMP src);
9503   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9504   ins_encode %{
9505     int vector_len = 0;
9506     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9507   %}
9508   ins_pipe( pipe_slow );
9509 %}
9510 
9511 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9512   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9513   match(Set dst (URShiftVS src shift));
9514   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9515   ins_encode %{
9516     int vector_len = 0;
9517     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9518   %}
9519   ins_pipe( pipe_slow );
9520 %}
9521 
9522 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9523   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9524   match(Set dst (URShiftVS src shift));
9525   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9526   ins_encode %{
9527     int vector_len = 0;
9528     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9529   %}
9530   ins_pipe( pipe_slow );
9531 %}
9532 
9533 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9534   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9535   match(Set dst (URShiftVS dst shift));
9536   effect(TEMP src);
9537   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9538   ins_encode %{
9539     int vector_len = 0;
9540     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9541   %}
9542   ins_pipe( pipe_slow );
9543 %}
9544 
9545 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9546   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9547   match(Set dst (URShiftVS src shift));
9548   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9549   ins_encode %{
9550     int vector_len = 1;
9551     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9552   %}
9553   ins_pipe( pipe_slow );
9554 %}
9555 
9556 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9557   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9558   match(Set dst (URShiftVS src shift));
9559   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9560   ins_encode %{
9561     int vector_len = 1;
9562     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9563   %}
9564   ins_pipe( pipe_slow );
9565 %}
9566 
9567 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9568   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9569   match(Set dst (URShiftVS dst shift));
9570   effect(TEMP src);
9571   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9572   ins_encode %{
9573     int vector_len = 1;
9574     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9575   %}
9576   ins_pipe( pipe_slow );
9577 %}
9578 
9579 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9580   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9581   match(Set dst (URShiftVS src shift));
9582   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9583   ins_encode %{
9584     int vector_len = 1;
9585     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9586   %}
9587   ins_pipe( pipe_slow );
9588 %}
9589 
9590 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9591   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9592   match(Set dst (URShiftVS src shift));
9593   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9594   ins_encode %{
9595     int vector_len = 1;
9596     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9597   %}
9598   ins_pipe( pipe_slow );
9599 %}
9600 
9601 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9602   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9603   match(Set dst (URShiftVS dst shift));
9604   effect(TEMP src);
9605   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9606   ins_encode %{
9607     int vector_len = 1;
9608     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9609   %}
9610   ins_pipe( pipe_slow );
9611 %}
9612 
9613 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9614   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9615   match(Set dst (URShiftVS src shift));
9616   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9617   ins_encode %{
9618     int vector_len = 2;
9619     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9620   %}
9621   ins_pipe( pipe_slow );
9622 %}
9623 
9624 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9625   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9626   match(Set dst (URShiftVS src shift));
9627   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9628   ins_encode %{
9629     int vector_len = 2;
9630     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9631   %}
9632   ins_pipe( pipe_slow );
9633 %}
9634 
9635 // Integers vector logical right shift
9636 instruct vsrl2I(vecD dst, vecS shift) %{
9637   predicate(n->as_Vector()->length() == 2);
9638   match(Set dst (URShiftVI dst shift));
9639   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9640   ins_encode %{
9641     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9642   %}
9643   ins_pipe( pipe_slow );
9644 %}
9645 
9646 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
9647   predicate(n->as_Vector()->length() == 2);
9648   match(Set dst (URShiftVI dst shift));
9649   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9650   ins_encode %{
9651     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9652   %}
9653   ins_pipe( pipe_slow );
9654 %}
9655 
9656 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
9657   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9658   match(Set dst (URShiftVI src shift));
9659   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9660   ins_encode %{
9661     int vector_len = 0;
9662     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9663   %}
9664   ins_pipe( pipe_slow );
9665 %}
9666 
9667 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9668   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9669   match(Set dst (URShiftVI src shift));
9670   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9671   ins_encode %{
9672     int vector_len = 0;
9673     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9674   %}
9675   ins_pipe( pipe_slow );
9676 %}
9677 
9678 instruct vsrl4I(vecX dst, vecS shift) %{
9679   predicate(n->as_Vector()->length() == 4);
9680   match(Set dst (URShiftVI dst shift));
9681   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9682   ins_encode %{
9683     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9684   %}
9685   ins_pipe( pipe_slow );
9686 %}
9687 
9688 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
9689   predicate(n->as_Vector()->length() == 4);
9690   match(Set dst (URShiftVI dst shift));
9691   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9692   ins_encode %{
9693     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9694   %}
9695   ins_pipe( pipe_slow );
9696 %}
9697 
9698 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
9699   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9700   match(Set dst (URShiftVI src shift));
9701   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9702   ins_encode %{
9703     int vector_len = 0;
9704     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9705   %}
9706   ins_pipe( pipe_slow );
9707 %}
9708 
9709 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9710   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9711   match(Set dst (URShiftVI src shift));
9712   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9713   ins_encode %{
9714     int vector_len = 0;
9715     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9716   %}
9717   ins_pipe( pipe_slow );
9718 %}
9719 
9720 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
9721   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9722   match(Set dst (URShiftVI src shift));
9723   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9724   ins_encode %{
9725     int vector_len = 1;
9726     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9727   %}
9728   ins_pipe( pipe_slow );
9729 %}
9730 
9731 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9732   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9733   match(Set dst (URShiftVI src shift));
9734   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9735   ins_encode %{
9736     int vector_len = 1;
9737     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9738   %}
9739   ins_pipe( pipe_slow );
9740 %}
9741 
9742 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
9743   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9744   match(Set dst (URShiftVI src shift));
9745   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9746   ins_encode %{
9747     int vector_len = 2;
9748     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9749   %}
9750   ins_pipe( pipe_slow );
9751 %}
9752 
9753 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9754   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9755   match(Set dst (URShiftVI src shift));
9756   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9757   ins_encode %{
9758     int vector_len = 2;
9759     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9760   %}
9761   ins_pipe( pipe_slow );
9762 %}
9763 
9764 // Longs vector logical right shift
9765 instruct vsrl2L(vecX dst, vecS shift) %{
9766   predicate(n->as_Vector()->length() == 2);
9767   match(Set dst (URShiftVL dst shift));
9768   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9769   ins_encode %{
9770     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
9771   %}
9772   ins_pipe( pipe_slow );
9773 %}
9774 
9775 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9776   predicate(n->as_Vector()->length() == 2);
9777   match(Set dst (URShiftVL dst shift));
9778   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9779   ins_encode %{
9780     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9781   %}
9782   ins_pipe( pipe_slow );
9783 %}
9784 
9785 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9786   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9787   match(Set dst (URShiftVL src shift));
9788   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9789   ins_encode %{
9790     int vector_len = 0;
9791     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9792   %}
9793   ins_pipe( pipe_slow );
9794 %}
9795 
9796 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9797   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9798   match(Set dst (URShiftVL src shift));
9799   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9800   ins_encode %{
9801     int vector_len = 0;
9802     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9803   %}
9804   ins_pipe( pipe_slow );
9805 %}
9806 
9807 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9808   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9809   match(Set dst (URShiftVL src shift));
9810   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9811   ins_encode %{
9812     int vector_len = 1;
9813     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9814   %}
9815   ins_pipe( pipe_slow );
9816 %}
9817 
9818 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9819   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9820   match(Set dst (URShiftVL src shift));
9821   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9822   ins_encode %{
9823     int vector_len = 1;
9824     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9825   %}
9826   ins_pipe( pipe_slow );
9827 %}
9828 
9829 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9830   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9831   match(Set dst (URShiftVL src shift));
9832   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9833   ins_encode %{
9834     int vector_len = 2;
9835     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9836   %}
9837   ins_pipe( pipe_slow );
9838 %}
9839 
9840 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9841   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9842   match(Set dst (URShiftVL src shift));
9843   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9844   ins_encode %{
9845     int vector_len = 2;
9846     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9847   %}
9848   ins_pipe( pipe_slow );
9849 %}
9850 
9851 // ------------------- ArithmeticRightShift -----------------------------------
9852 
9853 // Shorts/Chars vector arithmetic right shift
9854 instruct vsra2S(vecS dst, vecS shift) %{
9855   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9856   match(Set dst (RShiftVS dst shift));
9857   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9858   ins_encode %{
9859     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9860   %}
9861   ins_pipe( pipe_slow );
9862 %}
9863 
9864 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9865   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9866   match(Set dst (RShiftVS dst shift));
9867   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9868   ins_encode %{
9869     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9870   %}
9871   ins_pipe( pipe_slow );
9872 %}
9873 
9874 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9875   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9876   match(Set dst (RShiftVS src shift));
9877   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9878   ins_encode %{
9879     int vector_len = 0;
9880     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9881   %}
9882   ins_pipe( pipe_slow );
9883 %}
9884 
9885 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9886   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9887   match(Set dst (RShiftVS src shift));
9888   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9889   ins_encode %{
9890     int vector_len = 0;
9891     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9892   %}
9893   ins_pipe( pipe_slow );
9894 %}
9895 
9896 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9897   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9898   match(Set dst (RShiftVS dst shift));
9899   effect(TEMP src);
9900   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9901   ins_encode %{
9902     int vector_len = 0;
9903     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9904   %}
9905   ins_pipe( pipe_slow );
9906 %}
9907 
9908 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9909   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9910   match(Set dst (RShiftVS src shift));
9911   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9912   ins_encode %{
9913     int vector_len = 0;
9914     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9915   %}
9916   ins_pipe( pipe_slow );
9917 %}
9918 
9919 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9920   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9921   match(Set dst (RShiftVS src shift));
9922   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9923   ins_encode %{
9924     int vector_len = 0;
9925     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9926   %}
9927   ins_pipe( pipe_slow );
9928 %}
9929 
9930 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9931   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9932   match(Set dst (RShiftVS dst shift));
9933   effect(TEMP src);
9934   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9935   ins_encode %{
9936     int vector_len = 0;
9937     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9938   %}
9939   ins_pipe( pipe_slow );
9940 %}
9941 
9942 instruct vsra4S(vecD dst, vecS shift) %{
9943   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9944   match(Set dst (RShiftVS dst shift));
9945   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9946   ins_encode %{
9947     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9948   %}
9949   ins_pipe( pipe_slow );
9950 %}
9951 
9952 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9953   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9954   match(Set dst (RShiftVS dst shift));
9955   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9956   ins_encode %{
9957     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9958   %}
9959   ins_pipe( pipe_slow );
9960 %}
9961 
9962 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9963   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9964   match(Set dst (RShiftVS src shift));
9965   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9966   ins_encode %{
9967     int vector_len = 0;
9968     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9969   %}
9970   ins_pipe( pipe_slow );
9971 %}
9972 
9973 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9974   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9975   match(Set dst (RShiftVS src shift));
9976   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9977   ins_encode %{
9978     int vector_len = 0;
9979     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9980   %}
9981   ins_pipe( pipe_slow );
9982 %}
9983 
9984 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9985   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9986   match(Set dst (RShiftVS dst shift));
9987   effect(TEMP src);
9988   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9989   ins_encode %{
9990     int vector_len = 0;
9991     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9992   %}
9993   ins_pipe( pipe_slow );
9994 %}
9995 
9996 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9997   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9998   match(Set dst (RShiftVS src shift));
9999   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10000   ins_encode %{
10001     int vector_len = 0;
10002     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10003   %}
10004   ins_pipe( pipe_slow );
10005 %}
10006 
10007 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
10008   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10009   match(Set dst (RShiftVS src shift));
10010   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10011   ins_encode %{
10012     int vector_len = 0;
10013     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10014   %}
10015   ins_pipe( pipe_slow );
10016 %}
10017 
10018 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
10019   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10020   match(Set dst (RShiftVS dst shift));
10021   effect(TEMP src);
10022   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10023   ins_encode %{
10024     int vector_len = 0;
10025     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10026   %}
10027   ins_pipe( pipe_slow );
10028 %}
10029 
10030 instruct vsra8S(vecX dst, vecS shift) %{
10031   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10032   match(Set dst (RShiftVS dst shift));
10033   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10034   ins_encode %{
10035     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10036   %}
10037   ins_pipe( pipe_slow );
10038 %}
10039 
10040 instruct vsra8S_imm(vecX dst, immI8 shift) %{
10041   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10042   match(Set dst (RShiftVS dst shift));
10043   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10044   ins_encode %{
10045     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10046   %}
10047   ins_pipe( pipe_slow );
10048 %}
10049 
10050 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
10051   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10052   match(Set dst (RShiftVS src shift));
10053   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10054   ins_encode %{
10055     int vector_len = 0;
10056     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10057   %}
10058   ins_pipe( pipe_slow );
10059 %}
10060 
10061 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
10062   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10063   match(Set dst (RShiftVS src shift));
10064   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10065   ins_encode %{
10066     int vector_len = 0;
10067     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10068   %}
10069   ins_pipe( pipe_slow );
10070 %}
10071 
10072 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
10073   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10074   match(Set dst (RShiftVS dst shift));
10075   effect(TEMP src);
10076   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10077   ins_encode %{
10078     int vector_len = 0;
10079     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10080   %}
10081   ins_pipe( pipe_slow );
10082 %}
10083 
10084 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
10085   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10086   match(Set dst (RShiftVS src shift));
10087   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10088   ins_encode %{
10089     int vector_len = 0;
10090     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10091   %}
10092   ins_pipe( pipe_slow );
10093 %}
10094 
10095 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
10096   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10097   match(Set dst (RShiftVS src shift));
10098   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10099   ins_encode %{
10100     int vector_len = 0;
10101     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10102   %}
10103   ins_pipe( pipe_slow );
10104 %}
10105 
10106 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
10107   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10108   match(Set dst (RShiftVS dst shift));
10109   effect(TEMP src);
10110   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10111   ins_encode %{
10112     int vector_len = 0;
10113     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10114   %}
10115   ins_pipe( pipe_slow );
10116 %}
10117 
10118 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
10119   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10120   match(Set dst (RShiftVS src shift));
10121   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10122   ins_encode %{
10123     int vector_len = 1;
10124     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10125   %}
10126   ins_pipe( pipe_slow );
10127 %}
10128 
10129 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
10130   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10131   match(Set dst (RShiftVS src shift));
10132   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10133   ins_encode %{
10134     int vector_len = 1;
10135     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10136   %}
10137   ins_pipe( pipe_slow );
10138 %}
10139 
10140 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10141   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10142   match(Set dst (RShiftVS dst shift));
10143   effect(TEMP src);
10144   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10145   ins_encode %{
10146     int vector_len = 1;
10147     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10148   %}
10149   ins_pipe( pipe_slow );
10150 %}
10151 
10152 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10153   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10154   match(Set dst (RShiftVS src shift));
10155   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10156   ins_encode %{
10157     int vector_len = 1;
10158     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10159   %}
10160   ins_pipe( pipe_slow );
10161 %}
10162 
10163 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10164   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10165   match(Set dst (RShiftVS src shift));
10166   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10167   ins_encode %{
10168     int vector_len = 1;
10169     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10170   %}
10171   ins_pipe( pipe_slow );
10172 %}
10173 
10174 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10175   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10176   match(Set dst (RShiftVS dst shift));
10177   effect(TEMP src);
10178   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10179   ins_encode %{
10180     int vector_len = 1;
10181     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10182   %}
10183   ins_pipe( pipe_slow );
10184 %}
10185 
10186 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
10187   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10188   match(Set dst (RShiftVS src shift));
10189   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10190   ins_encode %{
10191     int vector_len = 2;
10192     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10193   %}
10194   ins_pipe( pipe_slow );
10195 %}
10196 
10197 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10198   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10199   match(Set dst (RShiftVS src shift));
10200   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10201   ins_encode %{
10202     int vector_len = 2;
10203     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10204   %}
10205   ins_pipe( pipe_slow );
10206 %}
10207 
10208 // Integers vector arithmetic right shift
10209 instruct vsra2I(vecD dst, vecS shift) %{
10210   predicate(n->as_Vector()->length() == 2);
10211   match(Set dst (RShiftVI dst shift));
10212   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10213   ins_encode %{
10214     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10215   %}
10216   ins_pipe( pipe_slow );
10217 %}
10218 
10219 instruct vsra2I_imm(vecD dst, immI8 shift) %{
10220   predicate(n->as_Vector()->length() == 2);
10221   match(Set dst (RShiftVI dst shift));
10222   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10223   ins_encode %{
10224     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10225   %}
10226   ins_pipe( pipe_slow );
10227 %}
10228 
10229 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
10230   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10231   match(Set dst (RShiftVI src shift));
10232   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10233   ins_encode %{
10234     int vector_len = 0;
10235     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10236   %}
10237   ins_pipe( pipe_slow );
10238 %}
10239 
10240 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
10241   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10242   match(Set dst (RShiftVI src shift));
10243   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10244   ins_encode %{
10245     int vector_len = 0;
10246     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10247   %}
10248   ins_pipe( pipe_slow );
10249 %}
10250 
10251 instruct vsra4I(vecX dst, vecS shift) %{
10252   predicate(n->as_Vector()->length() == 4);
10253   match(Set dst (RShiftVI dst shift));
10254   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10255   ins_encode %{
10256     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10257   %}
10258   ins_pipe( pipe_slow );
10259 %}
10260 
10261 instruct vsra4I_imm(vecX dst, immI8 shift) %{
10262   predicate(n->as_Vector()->length() == 4);
10263   match(Set dst (RShiftVI dst shift));
10264   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10265   ins_encode %{
10266     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10267   %}
10268   ins_pipe( pipe_slow );
10269 %}
10270 
10271 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
10272   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10273   match(Set dst (RShiftVI src shift));
10274   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10275   ins_encode %{
10276     int vector_len = 0;
10277     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10278   %}
10279   ins_pipe( pipe_slow );
10280 %}
10281 
10282 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10283   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10284   match(Set dst (RShiftVI src shift));
10285   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10286   ins_encode %{
10287     int vector_len = 0;
10288     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10289   %}
10290   ins_pipe( pipe_slow );
10291 %}
10292 
10293 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
10294   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10295   match(Set dst (RShiftVI src shift));
10296   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10297   ins_encode %{
10298     int vector_len = 1;
10299     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10300   %}
10301   ins_pipe( pipe_slow );
10302 %}
10303 
10304 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10305   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10306   match(Set dst (RShiftVI src shift));
10307   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10308   ins_encode %{
10309     int vector_len = 1;
10310     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10311   %}
10312   ins_pipe( pipe_slow );
10313 %}
10314 
10315 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
10316   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10317   match(Set dst (RShiftVI src shift));
10318   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10319   ins_encode %{
10320     int vector_len = 2;
10321     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10322   %}
10323   ins_pipe( pipe_slow );
10324 %}
10325 
10326 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10327   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10328   match(Set dst (RShiftVI src shift));
10329   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10330   ins_encode %{
10331     int vector_len = 2;
10332     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10333   %}
10334   ins_pipe( pipe_slow );
10335 %}
10336 
10337 // There are no longs vector arithmetic right shift instructions.
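//
// Reason: below AVX-512 x86 has no packed 64-bit arithmetic shift (vpsraq is
// EVEX-only), so no rule is written here. For reference, a scalar sketch of
// what each lane would have to compute, built from the logical shift that
// psrlq does provide (illustrative C++ only, assumes 0 < s < 64; nothing in
// this file emits such a sequence):
//
//   int64_t asr64(int64_t x, int s) {
//     uint64_t logical = (uint64_t)x >> s;                      // psrlq lane
//     uint64_t sign    = (x < 0) ? (~0ULL << (64 - s)) : 0ULL;  // copy sign bits
//     return (int64_t)(logical | sign);
//   }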
10338 
10339 
10340 // --------------------------------- AND --------------------------------------
10341 
10342 instruct vand4B(vecS dst, vecS src) %{
10343   predicate(n->as_Vector()->length_in_bytes() == 4);
10344   match(Set dst (AndV dst src));
10345   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
10346   ins_encode %{
10347     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10348   %}
10349   ins_pipe( pipe_slow );
10350 %}
10351 
10352 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
10353   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10354   match(Set dst (AndV src1 src2));
10355   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
10356   ins_encode %{
10357     int vector_len = 0;
10358     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10359   %}
10360   ins_pipe( pipe_slow );
10361 %}
10362 
10363 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
10364   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10365   match(Set dst (AndV src (LoadVector mem)));
10366   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
10367   ins_encode %{
10368     int vector_len = 0;
10369     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10370   %}
10371   ins_pipe( pipe_slow );
10372 %}
10373 
10374 instruct vand8B(vecD dst, vecD src) %{
10375   predicate(n->as_Vector()->length_in_bytes() == 8);
10376   match(Set dst (AndV dst src));
10377   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
10378   ins_encode %{
10379     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10380   %}
10381   ins_pipe( pipe_slow );
10382 %}
10383 
10384 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
10385   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10386   match(Set dst (AndV src1 src2));
10387   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
10388   ins_encode %{
10389     int vector_len = 0;
10390     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10391   %}
10392   ins_pipe( pipe_slow );
10393 %}
10394 
10395 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
10396   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10397   match(Set dst (AndV src (LoadVector mem)));
10398   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
10399   ins_encode %{
10400     int vector_len = 0;
10401     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10402   %}
10403   ins_pipe( pipe_slow );
10404 %}
10405 
10406 instruct vand16B(vecX dst, vecX src) %{
10407   predicate(n->as_Vector()->length_in_bytes() == 16);
10408   match(Set dst (AndV dst src));
10409   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
10410   ins_encode %{
10411     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10412   %}
10413   ins_pipe( pipe_slow );
10414 %}
10415 
10416 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
10417   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10418   match(Set dst (AndV src1 src2));
10419   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
10420   ins_encode %{
10421     int vector_len = 0;
10422     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10423   %}
10424   ins_pipe( pipe_slow );
10425 %}
10426 
10427 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
10428   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10429   match(Set dst (AndV src (LoadVector mem)));
10430   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
10431   ins_encode %{
10432     int vector_len = 0;
10433     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10434   %}
10435   ins_pipe( pipe_slow );
10436 %}
10437 
10438 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
10439   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10440   match(Set dst (AndV src1 src2));
10441   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
10442   ins_encode %{
10443     int vector_len = 1;
10444     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10445   %}
10446   ins_pipe( pipe_slow );
10447 %}
10448 
10449 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
10450   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10451   match(Set dst (AndV src (LoadVector mem)));
10452   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
10453   ins_encode %{
10454     int vector_len = 1;
10455     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10456   %}
10457   ins_pipe( pipe_slow );
10458 %}
10459 
10460 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10461   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10462   match(Set dst (AndV src1 src2));
10463   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
10464   ins_encode %{
10465     int vector_len = 2;
10466     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10467   %}
10468   ins_pipe( pipe_slow );
10469 %}
10470 
10471 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
10472   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10473   match(Set dst (AndV src (LoadVector mem)));
10474   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
10475   ins_encode %{
10476     int vector_len = 2;
10477     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10478   %}
10479   ins_pipe( pipe_slow );
10480 %}
10481 
10482 // --------------------------------- OR ---------------------------------------
10483 
10484 instruct vor4B(vecS dst, vecS src) %{
10485   predicate(n->as_Vector()->length_in_bytes() == 4);
10486   match(Set dst (OrV dst src));
10487   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
10488   ins_encode %{
10489     __ por($dst$$XMMRegister, $src$$XMMRegister);
10490   %}
10491   ins_pipe( pipe_slow );
10492 %}
10493 
10494 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
10495   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10496   match(Set dst (OrV src1 src2));
10497   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
10498   ins_encode %{
10499     int vector_len = 0;
10500     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10501   %}
10502   ins_pipe( pipe_slow );
10503 %}
10504 
10505 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
10506   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10507   match(Set dst (OrV src (LoadVector mem)));
10508   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
10509   ins_encode %{
10510     int vector_len = 0;
10511     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10512   %}
10513   ins_pipe( pipe_slow );
10514 %}
10515 
10516 instruct vor8B(vecD dst, vecD src) %{
10517   predicate(n->as_Vector()->length_in_bytes() == 8);
10518   match(Set dst (OrV dst src));
10519   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
10520   ins_encode %{
10521     __ por($dst$$XMMRegister, $src$$XMMRegister);
10522   %}
10523   ins_pipe( pipe_slow );
10524 %}
10525 
10526 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
10527   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10528   match(Set dst (OrV src1 src2));
10529   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
10530   ins_encode %{
10531     int vector_len = 0;
10532     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10533   %}
10534   ins_pipe( pipe_slow );
10535 %}
10536 
10537 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
10538   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10539   match(Set dst (OrV src (LoadVector mem)));
10540   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
10541   ins_encode %{
10542     int vector_len = 0;
10543     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10544   %}
10545   ins_pipe( pipe_slow );
10546 %}
10547 
10548 instruct vor16B(vecX dst, vecX src) %{
10549   predicate(n->as_Vector()->length_in_bytes() == 16);
10550   match(Set dst (OrV dst src));
10551   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
10552   ins_encode %{
10553     __ por($dst$$XMMRegister, $src$$XMMRegister);
10554   %}
10555   ins_pipe( pipe_slow );
10556 %}
10557 
10558 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
10559   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10560   match(Set dst (OrV src1 src2));
10561   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
10562   ins_encode %{
10563     int vector_len = 0;
10564     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10565   %}
10566   ins_pipe( pipe_slow );
10567 %}
10568 
10569 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
10570   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10571   match(Set dst (OrV src (LoadVector mem)));
10572   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
10573   ins_encode %{
10574     int vector_len = 0;
10575     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10576   %}
10577   ins_pipe( pipe_slow );
10578 %}
10579 
10580 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
10581   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10582   match(Set dst (OrV src1 src2));
10583   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
10584   ins_encode %{
10585     int vector_len = 1;
10586     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10587   %}
10588   ins_pipe( pipe_slow );
10589 %}
10590 
10591 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
10592   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10593   match(Set dst (OrV src (LoadVector mem)));
10594   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
10595   ins_encode %{
10596     int vector_len = 1;
10597     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10598   %}
10599   ins_pipe( pipe_slow );
10600 %}
10601 
10602 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10603   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10604   match(Set dst (OrV src1 src2));
10605   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
10606   ins_encode %{
10607     int vector_len = 2;
10608     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10609   %}
10610   ins_pipe( pipe_slow );
10611 %}
10612 
10613 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
10614   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10615   match(Set dst (OrV src (LoadVector mem)));
10616   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
10617   ins_encode %{
10618     int vector_len = 2;
10619     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10620   %}
10621   ins_pipe( pipe_slow );
10622 %}
10623 
10624 // --------------------------------- XOR --------------------------------------
10625 
10626 instruct vxor4B(vecS dst, vecS src) %{
10627   predicate(n->as_Vector()->length_in_bytes() == 4);
10628   match(Set dst (XorV dst src));
10629   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
10630   ins_encode %{
10631     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10632   %}
10633   ins_pipe( pipe_slow );
10634 %}
10635 
10636 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

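// Editorial note (illustrative, not part of the original file): the XorV
// rules above are normally reached through C2's SuperWord auto-vectorizer.
// A minimal Java sketch that can produce XorV / LoadVector nodes, assuming
// the loop is hot and the required UseSSE/UseAVX level is available:
//
//   static void xorBytes(byte[] dst, byte[] a, byte[] b) {
//     for (int i = 0; i < a.length; i++) {
//       dst[i] = (byte) (a[i] ^ b[i]);   // candidate for XorV vectorization
//     }
//   }
//
// Whether and how wide the loop vectorizes depends on SuperWord heuristics,
// alignment and array lengths.  The 8/16-byte rules fall back to the
// two-operand SSE2 pxor, while the AVX rules use the three-operand vpxor
// with vector_len selecting the 128-bit (0), 256-bit (1) or 512-bit (2)
// encoding.
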
// --------------------------------- FMA --------------------------------------

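// Editorial note (illustrative, not part of the original file): the
// FmaVD/FmaVF rules below fuse a packed multiply-add into one instruction
// when UseFMA is enabled.  The third operand doubles as the destination,
// matching the c = a * b + c shape of the ideal node, and the _mem variants
// fold the LoadVector of the second multiplicand into a memory operand
// ($b$$Address).  A hedged Java sketch of source that can be vectorized
// into these nodes, assuming Math.fma is intrinsified and the loop is hot:
//
//   static void fma(double[] a, double[] b, double[] c) {
//     for (int i = 0; i < c.length; i++) {
//       c[i] = Math.fma(a[i], b[i], c[i]);   // candidate for FmaVD packed2D/4D/8D
//     }
//   }
//
// The MacroAssembler helpers vfmad/vfmaf are expected to emit a packed
// fused multiply-add (e.g. a VFMADD231PD/PS form); see macroAssembler_x86
// for the exact encoding.
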
// a * b + c
instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma2D_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4D_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4F_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8F_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- PopCount --------------------------------------

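// Editorial note (illustrative, not part of the original file): the
// PopCountVI rules require the AVX-512 VPOPCNTDQ extension
// (VM_Version::supports_vpopcntdq()) in addition to UsePopCountInstruction.
// A hedged Java sketch of a loop that can be auto-vectorized into
// PopCountVI nodes on such hardware:
//
//   static void popcounts(int[] dst, int[] src) {
//     for (int i = 0; i < src.length; i++) {
//       dst[i] = Integer.bitCount(src[i]);   // candidate for PopCountVI / vpopcntd
//     }
//   }
//
// vector_len selects the 128-bit (0), 256-bit (1) or 512-bit (2) form of
// vpopcntd for the packed2I/4I, packed8I and packed16I cases respectively.
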
instruct vpopcount2I(vecD dst, vecD src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount4I(vecX dst, vecX src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount8I(vecY dst, vecY src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount16I(vecZ dst, vecZ src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}