1 //
   2 // Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
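// As an illustration only (not a definition made in this file), a general-purpose
// register in the 64-bit port is declared along the lines of:
//
//   reg_def RAX (SOC, SOC, Op_RegI, 0, rax->as_VMReg());   // caller-saved, spilled as an int
//   reg_def RBX (NS,  SOE, Op_RegI, 3, rbx->as_VMReg());   // callee-saved upon entry
//
// i.e. the two save types, the ideal spill type, the hardware encoding, and the
// backing VMReg.  The actual general-purpose definitions live in x86_32.ad and
// x86_64.ad; this common file only describes the XMM registers and the flags.
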
  62 // XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-(p).
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX architectures:
  68 //      XMM8-XMM15 must be encoded with a REX prefix (VEX when UseAVX is set)
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 need an extended encoding (XMM16-XMM31 require EVEX).
  71 //
  72 // Linux ABI:   No XMM registers are preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM15 are preserved across function calls
  75 //              (XMM16-XMM31 are volatile); XMM0-XMM3 might hold parameters
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
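
// Condition-code producers and consumers reach this class through a flags
// operand.  A minimal sketch of such an operand (illustration only; the real
// eFlagsReg/rFlagsReg operands are defined in the 32/64-bit specific AD files,
// x86_32.ad and x86_64.ad):
//
//   operand rFlagsReg() %{
//     constraint(ALLOC_IN_RC(int_flags));
//     match(RegFlags);
//     format %{ "RFLAGS" %}
//     interface(REG_INTER);
//   %}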
 672 
 673 // Class for pre-EVEX float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for EVEX float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
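
// reg_class_dynamic selects between the EVEX and legacy class at runtime via
// the C++ predicate in %{ ... %}, so operands can name a single class
// regardless of the CPU.  A sketch of how such a class is referenced
// (illustration only; the actual float/double operands are defined in
// x86_32.ad / x86_64.ad):
//
//   operand regF() %{
//     constraint(ALLOC_IN_RC(float_reg));
//     match(RegF);
//     format %{ %}
//     interface(REG_INTER);
//   %}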
 733 
 734 // Class for pre-EVEX double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for EVEX double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre-EVEX 32-bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for EVEX 32-bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for pre-EVEX 64-bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for EVEX 64-bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for pre-EVEX 128-bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for EVEX 128-bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
1276     // The exception handler starts out as a jump and can be patched to
1277     // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
1285     // Three 5-byte instructions plus one move for the unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
1291     // The exception handler starts out as a jump and can be patched to
1292     // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // Push "the_pc" on the stack without destroying any registers,
1342   // as they may all be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
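     // For illustration, the stub emitted above on LP64 is roughly:
     //   call next                      ; pushes the address of "next"
     //   next: sub qword [rsp], <delta> ; rewind the pushed address back to "the_pc"
     //   jmp  SharedRuntime::deopt_blob()->unpack()
     // while the 32-bit variant is simply pushptr(here); jmp unpack().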
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375 
1376 
1377 const bool Matcher::match_rule_supported(int opcode) {
1378   if (!has_match_rule(opcode))
1379     return false;
1380 
1381   bool ret_value = true;
1382   switch (opcode) {
1383     case Op_PopCountI:
1384     case Op_PopCountL:
1385       if (!UsePopCountInstruction)
1386         ret_value = false;
1387       break;
1388     case Op_PopCountVI:
1389       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1390         ret_value = false;
1391       break;
1392     case Op_MulVI:
1393       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1394         ret_value = false;
1395       break;
1396     case Op_MulVL:
1397     case Op_MulReductionVL:
1398       if (VM_Version::supports_avx512dq() == false)
1399         ret_value = false;
1400       break;
1401     case Op_AddReductionVL:
1402       if (UseAVX < 3) // only EVEX: vector connectivity becomes an issue here
1403         ret_value = false;
1404       break;
1405     case Op_AddReductionVI:
1406       if (UseSSE < 3) // requires at least SSE3
1407         ret_value = false;
1408       break;
1409     case Op_MulReductionVI:
1410       if (UseSSE < 4) // requires at least SSE4
1411         ret_value = false;
1412       break;
1413     case Op_AddReductionVF:
1414     case Op_AddReductionVD:
1415     case Op_MulReductionVF:
1416     case Op_MulReductionVD:
1417       if (UseSSE < 1) // requires at least SSE
1418         ret_value = false;
1419       break;
1420     case Op_SqrtVD:
1421     case Op_SqrtVF:
1422       if (UseAVX < 1) // enabled for AVX only
1423         ret_value = false;
1424       break;
1425     case Op_CompareAndSwapL:
1426 #ifdef _LP64
1427     case Op_CompareAndSwapP:
1428 #endif
1429       if (!VM_Version::supports_cx8())
1430         ret_value = false;
1431       break;
1432     case Op_CMoveVF:
1433     case Op_CMoveVD:
1434       if (UseAVX < 1 || UseAVX > 2)
1435         ret_value = false;
1436       break;
1437     case Op_StrIndexOf:
1438       if (!UseSSE42Intrinsics)
1439         ret_value = false;
1440       break;
1441     case Op_StrIndexOfChar:
1442       if (!UseSSE42Intrinsics)
1443         ret_value = false;
1444       break;
1445     case Op_OnSpinWait:
1446       if (VM_Version::supports_on_spin_wait() == false)
1447         ret_value = false;
1448       break;
1449     case Op_MulAddVS2VI:
1450       if (UseSSE < 2)
1451         ret_value = false;
1452       break;
1453     case Op_MaxD:
1454     case Op_MaxF:
1455     case Op_MinD:
1456     case Op_MinF:
1457       if (UseAVX < 1) // enabled for AVX only
1458         ret_value = false;
1459       break;
1460   }
1461 
1462   return ret_value;  // By default, match rules are supported.
1463 }
1464 
1465 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1466   // Identify extra cases that we might want to provide match rules for,
1467   // e.g. Op_ vector nodes and other intrinsics, guarded by vlen.
1468   bool ret_value = match_rule_supported(opcode);
1469   if (ret_value) {
1470     switch (opcode) {
1471       case Op_AddVB:
1472       case Op_SubVB:
1473         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1474           ret_value = false;
1475         break;
1476       case Op_URShiftVS:
1477       case Op_RShiftVS:
1478       case Op_LShiftVS:
1479       case Op_MulVS:
1480       case Op_AddVS:
1481       case Op_SubVS:
1482         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1483           ret_value = false;
1484         break;
1485       case Op_CMoveVF:
1486         if (vlen != 8)
1487           ret_value  = false;
1488         break;
1489       case Op_CMoveVD:
1490         if (vlen != 4)
1491           ret_value  = false;
1492         break;
1493     }
1494   }
1495 
1496   return ret_value;  // By default, match rules are supported.
1497 }
1498 
1499 const bool Matcher::has_predicated_vectors(void) {
1500   bool ret_value = false;
1501   if (UseAVX > 2) {
1502     ret_value = VM_Version::supports_avx512vl();
1503   }
1504 
1505   return ret_value;
1506 }
1507 
1508 const int Matcher::float_pressure(int default_pressure_threshold) {
1509   int float_pressure_threshold = default_pressure_threshold;
1510 #ifdef _LP64
1511   if (UseAVX > 2) {
1512     // Increase the pressure threshold on machines with AVX-512 (AVX3),
1513     // which have twice as many XMM registers.
1514     float_pressure_threshold = default_pressure_threshold * 2;
1515   }
1516 #endif
1517   return float_pressure_threshold;
1518 }
1519 
1520 // Max vector size in bytes. 0 if not supported.
1521 const int Matcher::vector_width_in_bytes(BasicType bt) {
1522   assert(is_java_primitive(bt), "only primitive type vectors");
1523   if (UseSSE < 2) return 0;
1524   // SSE2 supports 128bit vectors for all types.
1525   // AVX2 supports 256bit vectors for all types.
1526   // AVX-512/EVEX supports 512bit vectors for all types.
1527   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1528   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1529   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1530     size = (UseAVX > 2) ? 64 : 32;
1531   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1532     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1533   // Use flag to limit vector size.
1534   size = MIN2(size,(int)MaxVectorSize);
1535   // Minimum 2 values in vector (or 4 for bytes).
1536   switch (bt) {
1537   case T_DOUBLE:
1538   case T_LONG:
1539     if (size < 16) return 0;
1540     break;
1541   case T_FLOAT:
1542   case T_INT:
1543     if (size < 8) return 0;
1544     break;
1545   case T_BOOLEAN:
1546     if (size < 4) return 0;
1547     break;
1548   case T_CHAR:
1549     if (size < 4) return 0;
1550     break;
1551   case T_BYTE:
1552     if (size < 4) return 0;
1553     break;
1554   case T_SHORT:
1555     if (size < 4) return 0;
1556     break;
1557   default:
1558     ShouldNotReachHere();
1559   }
1560   return size;
1561 }
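     // For illustration (sizes derived from the logic above; MaxVectorSize may clip them):
     //   UseSSE >= 2, UseAVX == 0    -> 16 bytes for all types
     //   UseAVX == 2, bt == T_INT    -> (1 << 2) * 8 = 32 bytes (8 ints)
     //   UseAVX  > 2, bt == T_BYTE   -> 64 bytes with avx512bw, otherwise 32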
1562 
1563 // Limits on vector size (number of elements) loaded into vector.
1564 const int Matcher::max_vector_size(const BasicType bt) {
1565   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1566 }
1567 const int Matcher::min_vector_size(const BasicType bt) {
1568   int max_size = max_vector_size(bt);
1569   // Min size which can be loaded into vector is 4 bytes.
1570   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1571   return MIN2(size,max_size);
1572 }
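     // For example: type2aelembytes(T_BYTE) == 1, so a byte vector holds at least
     // 4 elements, while int/float/long/double vectors hold at least 2 (both capped
     // by max_vector_size for the type).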
1573 
1574 // Vector ideal reg corresponding to specified size in bytes
1575 const uint Matcher::vector_ideal_reg(int size) {
1576   assert(MaxVectorSize >= size, "");
1577   switch(size) {
1578     case  4: return Op_VecS;
1579     case  8: return Op_VecD;
1580     case 16: return Op_VecX;
1581     case 32: return Op_VecY;
1582     case 64: return Op_VecZ;
1583   }
1584   ShouldNotReachHere();
1585   return 0;
1586 }
1587 
1588 // Only lowest bits of xmm reg are used for vector shift count.
1589 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1590   return Op_VecS;
1591 }
1592 
1593 // x86 supports misaligned vector stores/loads.
1594 const bool Matcher::misaligned_vectors_ok() {
1595   return !AlignVector; // can be changed by flag
1596 }
1597 
1598 // x86 AES instructions are compatible with SunJCE expanded
1599 // keys, hence we do not need to pass the original key to stubs
1600 const bool Matcher::pass_original_key_for_aes() {
1601   return false;
1602 }
1603 
1604 
1605 const bool Matcher::convi2l_type_required = true;
1606 
1607 // Check for shift by small constant as well
1608 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1609   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1610       shift->in(2)->get_int() <= 3 &&
1611       // Are there other uses besides address expressions?
1612       !matcher->is_visited(shift)) {
1613     address_visited.set(shift->_idx); // Flag as address_visited
1614     mstack.push(shift->in(2), Matcher::Visit);
1615     Node *conv = shift->in(1);
1616 #ifdef _LP64
1617     // Allow the Matcher to match the rule which bypasses the
1618     // ConvI2L operation for an array index on LP64
1619     // if the index value is positive.
1620     if (conv->Opcode() == Op_ConvI2L &&
1621         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1622         // Are there other uses besides address expressions?
1623         !matcher->is_visited(conv)) {
1624       address_visited.set(conv->_idx); // Flag as address_visited
1625       mstack.push(conv->in(1), Matcher::Pre_Visit);
1626     } else
1627 #endif
1628       mstack.push(conv, Matcher::Pre_Visit);
1629     return true;
1630   }
1631   return false;
1632 }
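     // For illustration, this lets an address expression such as base + (index << 2)
     // (an array access with a small constant shift) be folded into a single scaled
     // addressing mode instead of being computed into a register; on LP64 a ConvI2L
     // of a provably non-negative index is bypassed as well.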
1633 
1634 // Should the Matcher clone shifts on addressing modes, expecting them
1635 // to be subsumed into complex addressing expressions or compute them
1636 // into registers?
1637 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1638   Node *off = m->in(AddPNode::Offset);
1639   if (off->is_Con()) {
1640     address_visited.test_set(m->_idx); // Flag as address_visited
1641     Node *adr = m->in(AddPNode::Address);
1642 
1643     // Intel can handle 2 adds in addressing mode
1644     // AtomicAdd is not an addressing expression.
1645     // Cheap to find it by looking for screwy base.
1646     if (adr->is_AddP() &&
1647         !adr->in(AddPNode::Base)->is_top() &&
1648         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1649         // Are there other uses besides address expressions?
1650         !is_visited(adr)) {
1651       address_visited.set(adr->_idx); // Flag as address_visited
1652       Node *shift = adr->in(AddPNode::Offset);
1653       if (!clone_shift(shift, this, mstack, address_visited)) {
1654         mstack.push(shift, Pre_Visit);
1655       }
1656       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1657       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1658     } else {
1659       mstack.push(adr, Pre_Visit);
1660     }
1661 
1662     // Clone X+offset as it also folds into most addressing expressions
1663     mstack.push(off, Visit);
1664     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1665     return true;
1666   } else if (clone_shift(off, this, mstack, address_visited)) {
1667     address_visited.test_set(m->_idx); // Flag as address_visited
1668     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1669     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1670     return true;
1671   }
1672   return false;
1673 }
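     // For illustration, an AddP chain shaped roughly like
     //   (base + (addr + (index << scale))) + constant_offset
     // is cloned here so the matcher can fold it into one "base + index*scale + disp"
     // addressing mode ("Intel can handle 2 adds") instead of materializing the
     // intermediate sums in registers.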
1674 
1675 void Compile::reshape_address(AddPNode* addp) {
1676 }
1677 
1678 // Helper methods for MachSpillCopyNode::implementation().
1679 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1680                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
1681   // In the 64-bit VM the size calculation is very complex, so instructions
1682   // are emitted into a scratch buffer to determine the size.
1683   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1684   assert(ireg == Op_VecS || // 32bit vector
1685          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1686          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1687          "no non-adjacent vector moves" );
1688   if (cbuf) {
1689     MacroAssembler _masm(cbuf);
1690     int offset = __ offset();
1691     switch (ireg) {
1692     case Op_VecS: // copy whole register
1693     case Op_VecD:
1694     case Op_VecX:
1695 #ifndef _LP64
1696       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1697 #else
1698       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1699         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1700       } else {
1701         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1702       }
1703 #endif
1704       break;
1705     case Op_VecY:
1706 #ifndef _LP64
1707       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1708 #else
1709       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1710         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1711       } else {
1712         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1713       }
1714 #endif
1715       break;
1716     case Op_VecZ:
1717       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1718       break;
1719     default:
1720       ShouldNotReachHere();
1721     }
1722     int size = __ offset() - offset;
1723 #ifdef ASSERT
1724     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1725     assert(!do_size || size == 4, "incorrect size calculation");
1726 #endif
1727     return size;
1728 #ifndef PRODUCT
1729   } else if (!do_size) {
1730     switch (ireg) {
1731     case Op_VecS:
1732     case Op_VecD:
1733     case Op_VecX:
1734       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1735       break;
1736     case Op_VecY:
1737     case Op_VecZ:
1738       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1739       break;
1740     default:
1741       ShouldNotReachHere();
1742     }
1743 #endif
1744   }
1745   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1746   return (UseAVX > 2) ? 6 : 4;
1747 }
1748 
1749 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1750                             int stack_offset, int reg, uint ireg, outputStream* st) {
1751   // In the 64-bit VM the size calculation is very complex, so instructions
1752   // are emitted into a scratch buffer to determine the size.
1753   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1754   if (cbuf) {
1755     MacroAssembler _masm(cbuf);
1756     int offset = __ offset();
1757     if (is_load) {
1758       switch (ireg) {
1759       case Op_VecS:
1760         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1761         break;
1762       case Op_VecD:
1763         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1764         break;
1765       case Op_VecX:
1766 #ifndef _LP64
1767         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1768 #else
1769         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1770           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1771         } else {
1772           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1773           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1774         }
1775 #endif
1776         break;
1777       case Op_VecY:
1778 #ifndef _LP64
1779         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1780 #else
1781         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1782           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1783         } else {
1784           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1785           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1786         }
1787 #endif
1788         break;
1789       case Op_VecZ:
1790         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1791         break;
1792       default:
1793         ShouldNotReachHere();
1794       }
1795     } else { // store
1796       switch (ireg) {
1797       case Op_VecS:
1798         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1799         break;
1800       case Op_VecD:
1801         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1802         break;
1803       case Op_VecX:
1804 #ifndef _LP64
1805         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1806 #else
1807         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1808           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1809         }
1810         else {
1811           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1812         }
1813 #endif
1814         break;
1815       case Op_VecY:
1816 #ifndef _LP64
1817         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1818 #else
1819         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1820           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1821         }
1822         else {
1823           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1824         }
1825 #endif
1826         break;
1827       case Op_VecZ:
1828         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1829         break;
1830       default:
1831         ShouldNotReachHere();
1832       }
1833     }
1834     int size = __ offset() - offset;
1835 #ifdef ASSERT
1836     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1837     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1838     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1839 #endif
1840     return size;
1841 #ifndef PRODUCT
1842   } else if (!do_size) {
1843     if (is_load) {
1844       switch (ireg) {
1845       case Op_VecS:
1846         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1847         break;
1848       case Op_VecD:
1849         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1850         break;
1851        case Op_VecX:
1852         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1853         break;
1854       case Op_VecY:
1855       case Op_VecZ:
1856         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1857         break;
1858       default:
1859         ShouldNotReachHere();
1860       }
1861     } else { // store
1862       switch (ireg) {
1863       case Op_VecS:
1864         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1865         break;
1866       case Op_VecD:
1867         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1868         break;
1869        case Op_VecX:
1870         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1871         break;
1872       case Op_VecY:
1873       case Op_VecZ:
1874         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1875         break;
1876       default:
1877         ShouldNotReachHere();
1878       }
1879     }
1880 #endif
1881   }
1882   bool is_single_byte = false;
1883   int vec_len = 0;
1884   if ((UseAVX > 2) && (stack_offset != 0)) {
1885     int tuple_type = Assembler::EVEX_FVM;
1886     int input_size = Assembler::EVEX_32bit;
1887     switch (ireg) {
1888     case Op_VecS:
1889       tuple_type = Assembler::EVEX_T1S;
1890       break;
1891     case Op_VecD:
1892       tuple_type = Assembler::EVEX_T1S;
1893       input_size = Assembler::EVEX_64bit;
1894       break;
1895     case Op_VecX:
1896       break;
1897     case Op_VecY:
1898       vec_len = 1;
1899       break;
1900     case Op_VecZ:
1901       vec_len = 2;
1902       break;
1903     }
1904     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1905   }
1906   int offset_size = 0;
1907   int size = 5;
1908   if (UseAVX > 2 ) {
1909     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1910       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1911       size += 2; // Need an additional two bytes for EVEX encoding
1912     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1913       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1914     } else {
1915       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1916       size += 2; // Need an additional two bytes for EVEX encoding
1917     }
1918   } else {
1919     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1920   }
1921   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1922   return size+offset_size;
1923 }
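     // Note: the EVEX sizing above assumes compressed displacement (disp8*N) encoding;
     // query_compressed_disp_byte() is taken to report whether stack_offset fits in a
     // single scaled disp8 byte for the given tuple type and input size, otherwise a
     // full 4-byte displacement is counted.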
1924 
1925 static inline jint replicate4_imm(int con, int width) {
1926   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1927   assert(width == 1 || width == 2, "only byte or short types here");
1928   int bit_width = width * 8;
1929   jint val = con;
1930   val &= (1 << bit_width) - 1;  // mask off sign bits
1931   while(bit_width < 32) {
1932     val |= (val << bit_width);
1933     bit_width <<= 1;
1934   }
1935   return val;
1936 }
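     // For example: replicate4_imm(0x12, 1)   -> 0x12121212
     //              replicate4_imm(0xABCD, 2) -> 0xABCDABCD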
1937 
1938 static inline jlong replicate8_imm(int con, int width) {
1939   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1940   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1941   int bit_width = width * 8;
1942   jlong val = con;
1943   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1944   while(bit_width < 64) {
1945     val |= (val << bit_width);
1946     bit_width <<= 1;
1947   }
1948   return val;
1949 }
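     // For example: replicate8_imm(0x12, 1)       -> 0x1212121212121212
     //              replicate8_imm(0x1234, 2)     -> 0x1234123412341234
     //              replicate8_imm(0xDEADBEEF, 4) -> 0xDEADBEEFDEADBEEF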
1950 
1951 #ifndef PRODUCT
1952   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1953     st->print("nop \t# %d bytes pad for loops and calls", _count);
1954   }
1955 #endif
1956 
1957   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1958     MacroAssembler _masm(&cbuf);
1959     __ nop(_count);
1960   }
1961 
1962   uint MachNopNode::size(PhaseRegAlloc*) const {
1963     return _count;
1964   }
1965 
1966 #ifndef PRODUCT
1967   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1968     st->print("# breakpoint");
1969   }
1970 #endif
1971 
1972   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1973     MacroAssembler _masm(&cbuf);
1974     __ int3();
1975   }
1976 
1977   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1978     return MachNode::size(ra_);
1979   }
1980 
1981 %}
1982 
1983 encode %{
1984 
1985   enc_class call_epilog %{
1986     if (VerifyStackAtCalls) {
1987       // Check that stack depth is unchanged: find majik cookie on stack
1988       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1989       MacroAssembler _masm(&cbuf);
1990       Label L;
1991       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1992       __ jccb(Assembler::equal, L);
1993       // Die if stack mismatch
1994       __ int3();
1995       __ bind(L);
1996     }
1997   %}
1998 
1999 %}
2000 
2001 
2002 //----------OPERANDS-----------------------------------------------------------
2003 // Operand definitions must precede instruction definitions for correct parsing
2004 // in the ADLC because operands constitute user defined types which are used in
2005 // instruction definitions.
2006 
2007 operand vecZ() %{
2008   constraint(ALLOC_IN_RC(vectorz_reg));
2009   match(VecZ);
2010 
2011   format %{ %}
2012   interface(REG_INTER);
2013 %}
2014 
2015 operand legVecZ() %{
2016   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2017   match(VecZ);
2018 
2019   format %{ %}
2020   interface(REG_INTER);
2021 %}
2022 
2023 // Comparison Code for FP conditional move
2024 operand cmpOp_vcmppd() %{
2025   match(Bool);
2026 
2027   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2028             n->as_Bool()->_test._test != BoolTest::no_overflow);
2029   format %{ "" %}
2030   interface(COND_INTER) %{
2031     equal        (0x0, "eq");
2032     less         (0x1, "lt");
2033     less_equal   (0x2, "le");
2034     not_equal    (0xC, "ne");
2035     greater_equal(0xD, "ge");
2036     greater      (0xE, "gt");
2037     //TODO cannot compile (adlc breaks) without two next lines with error:
2038     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2039     // equal' for overflow.
2040     overflow     (0x20, "o");  // not really supported by the instruction
2041     no_overflow  (0x21, "no"); // not really supported by the instruction
2042   %}
2043 %}
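     // Note: the encodings above appear to correspond to the AVX compare predicate
     // immediates used by vcmppd (e.g. 0x0 = EQ_OQ, 0x1 = LT_OS, 0xD = GE_OS,
     // 0xE = GT_OS); overflow/no_overflow have no vcmppd equivalent and exist only
     // to keep the adlc happy, as noted in the TODO above.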
2044 
2045 
2046 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2047 
2048 // ============================================================================
2049 
2050 instruct ShouldNotReachHere() %{
2051   match(Halt);
2052   format %{ "ud2\t# ShouldNotReachHere" %}
2053   ins_encode %{
2054     __ ud2();
2055   %}
2056   ins_pipe(pipe_slow);
2057 %}
2058 
2059 // =================================EVEX special===============================
2060 
2061 instruct setMask(rRegI dst, rRegI src) %{
2062   predicate(Matcher::has_predicated_vectors());
2063   match(Set dst (SetVectMaskI  src));
2064   effect(TEMP dst);
2065   format %{ "setvectmask   $dst, $src" %}
2066   ins_encode %{
2067     __ setvectmask($dst$$Register, $src$$Register);
2068   %}
2069   ins_pipe(pipe_slow);
2070 %}
2071 
2072 // ============================================================================
2073 
2074 instruct addF_reg(regF dst, regF src) %{
2075   predicate((UseSSE>=1) && (UseAVX == 0));
2076   match(Set dst (AddF dst src));
2077 
2078   format %{ "addss   $dst, $src" %}
2079   ins_cost(150);
2080   ins_encode %{
2081     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2082   %}
2083   ins_pipe(pipe_slow);
2084 %}
2085 
2086 instruct addF_mem(regF dst, memory src) %{
2087   predicate((UseSSE>=1) && (UseAVX == 0));
2088   match(Set dst (AddF dst (LoadF src)));
2089 
2090   format %{ "addss   $dst, $src" %}
2091   ins_cost(150);
2092   ins_encode %{
2093     __ addss($dst$$XMMRegister, $src$$Address);
2094   %}
2095   ins_pipe(pipe_slow);
2096 %}
2097 
2098 instruct addF_imm(regF dst, immF con) %{
2099   predicate((UseSSE>=1) && (UseAVX == 0));
2100   match(Set dst (AddF dst con));
2101   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2102   ins_cost(150);
2103   ins_encode %{
2104     __ addss($dst$$XMMRegister, $constantaddress($con));
2105   %}
2106   ins_pipe(pipe_slow);
2107 %}
2108 
2109 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2110   predicate(UseAVX > 0);
2111   match(Set dst (AddF src1 src2));
2112 
2113   format %{ "vaddss  $dst, $src1, $src2" %}
2114   ins_cost(150);
2115   ins_encode %{
2116     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2117   %}
2118   ins_pipe(pipe_slow);
2119 %}
2120 
2121 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2122   predicate(UseAVX > 0);
2123   match(Set dst (AddF src1 (LoadF src2)));
2124 
2125   format %{ "vaddss  $dst, $src1, $src2" %}
2126   ins_cost(150);
2127   ins_encode %{
2128     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2129   %}
2130   ins_pipe(pipe_slow);
2131 %}
2132 
2133 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2134   predicate(UseAVX > 0);
2135   match(Set dst (AddF src con));
2136 
2137   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2138   ins_cost(150);
2139   ins_encode %{
2140     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2141   %}
2142   ins_pipe(pipe_slow);
2143 %}
2144 
2145 instruct addD_reg(regD dst, regD src) %{
2146   predicate((UseSSE>=2) && (UseAVX == 0));
2147   match(Set dst (AddD dst src));
2148 
2149   format %{ "addsd   $dst, $src" %}
2150   ins_cost(150);
2151   ins_encode %{
2152     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2153   %}
2154   ins_pipe(pipe_slow);
2155 %}
2156 
2157 instruct addD_mem(regD dst, memory src) %{
2158   predicate((UseSSE>=2) && (UseAVX == 0));
2159   match(Set dst (AddD dst (LoadD src)));
2160 
2161   format %{ "addsd   $dst, $src" %}
2162   ins_cost(150);
2163   ins_encode %{
2164     __ addsd($dst$$XMMRegister, $src$$Address);
2165   %}
2166   ins_pipe(pipe_slow);
2167 %}
2168 
2169 instruct addD_imm(regD dst, immD con) %{
2170   predicate((UseSSE>=2) && (UseAVX == 0));
2171   match(Set dst (AddD dst con));
2172   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2173   ins_cost(150);
2174   ins_encode %{
2175     __ addsd($dst$$XMMRegister, $constantaddress($con));
2176   %}
2177   ins_pipe(pipe_slow);
2178 %}
2179 
2180 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2181   predicate(UseAVX > 0);
2182   match(Set dst (AddD src1 src2));
2183 
2184   format %{ "vaddsd  $dst, $src1, $src2" %}
2185   ins_cost(150);
2186   ins_encode %{
2187     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2188   %}
2189   ins_pipe(pipe_slow);
2190 %}
2191 
2192 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2193   predicate(UseAVX > 0);
2194   match(Set dst (AddD src1 (LoadD src2)));
2195 
2196   format %{ "vaddsd  $dst, $src1, $src2" %}
2197   ins_cost(150);
2198   ins_encode %{
2199     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2200   %}
2201   ins_pipe(pipe_slow);
2202 %}
2203 
2204 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2205   predicate(UseAVX > 0);
2206   match(Set dst (AddD src con));
2207 
2208   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2209   ins_cost(150);
2210   ins_encode %{
2211     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2212   %}
2213   ins_pipe(pipe_slow);
2214 %}
2215 
2216 instruct subF_reg(regF dst, regF src) %{
2217   predicate((UseSSE>=1) && (UseAVX == 0));
2218   match(Set dst (SubF dst src));
2219 
2220   format %{ "subss   $dst, $src" %}
2221   ins_cost(150);
2222   ins_encode %{
2223     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2224   %}
2225   ins_pipe(pipe_slow);
2226 %}
2227 
2228 instruct subF_mem(regF dst, memory src) %{
2229   predicate((UseSSE>=1) && (UseAVX == 0));
2230   match(Set dst (SubF dst (LoadF src)));
2231 
2232   format %{ "subss   $dst, $src" %}
2233   ins_cost(150);
2234   ins_encode %{
2235     __ subss($dst$$XMMRegister, $src$$Address);
2236   %}
2237   ins_pipe(pipe_slow);
2238 %}
2239 
2240 instruct subF_imm(regF dst, immF con) %{
2241   predicate((UseSSE>=1) && (UseAVX == 0));
2242   match(Set dst (SubF dst con));
2243   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2244   ins_cost(150);
2245   ins_encode %{
2246     __ subss($dst$$XMMRegister, $constantaddress($con));
2247   %}
2248   ins_pipe(pipe_slow);
2249 %}
2250 
2251 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2252   predicate(UseAVX > 0);
2253   match(Set dst (SubF src1 src2));
2254 
2255   format %{ "vsubss  $dst, $src1, $src2" %}
2256   ins_cost(150);
2257   ins_encode %{
2258     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2259   %}
2260   ins_pipe(pipe_slow);
2261 %}
2262 
2263 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2264   predicate(UseAVX > 0);
2265   match(Set dst (SubF src1 (LoadF src2)));
2266 
2267   format %{ "vsubss  $dst, $src1, $src2" %}
2268   ins_cost(150);
2269   ins_encode %{
2270     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2271   %}
2272   ins_pipe(pipe_slow);
2273 %}
2274 
2275 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2276   predicate(UseAVX > 0);
2277   match(Set dst (SubF src con));
2278 
2279   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2280   ins_cost(150);
2281   ins_encode %{
2282     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2283   %}
2284   ins_pipe(pipe_slow);
2285 %}
2286 
2287 instruct subD_reg(regD dst, regD src) %{
2288   predicate((UseSSE>=2) && (UseAVX == 0));
2289   match(Set dst (SubD dst src));
2290 
2291   format %{ "subsd   $dst, $src" %}
2292   ins_cost(150);
2293   ins_encode %{
2294     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2295   %}
2296   ins_pipe(pipe_slow);
2297 %}
2298 
2299 instruct subD_mem(regD dst, memory src) %{
2300   predicate((UseSSE>=2) && (UseAVX == 0));
2301   match(Set dst (SubD dst (LoadD src)));
2302 
2303   format %{ "subsd   $dst, $src" %}
2304   ins_cost(150);
2305   ins_encode %{
2306     __ subsd($dst$$XMMRegister, $src$$Address);
2307   %}
2308   ins_pipe(pipe_slow);
2309 %}
2310 
2311 instruct subD_imm(regD dst, immD con) %{
2312   predicate((UseSSE>=2) && (UseAVX == 0));
2313   match(Set dst (SubD dst con));
2314   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2315   ins_cost(150);
2316   ins_encode %{
2317     __ subsd($dst$$XMMRegister, $constantaddress($con));
2318   %}
2319   ins_pipe(pipe_slow);
2320 %}
2321 
2322 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2323   predicate(UseAVX > 0);
2324   match(Set dst (SubD src1 src2));
2325 
2326   format %{ "vsubsd  $dst, $src1, $src2" %}
2327   ins_cost(150);
2328   ins_encode %{
2329     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2330   %}
2331   ins_pipe(pipe_slow);
2332 %}
2333 
2334 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2335   predicate(UseAVX > 0);
2336   match(Set dst (SubD src1 (LoadD src2)));
2337 
2338   format %{ "vsubsd  $dst, $src1, $src2" %}
2339   ins_cost(150);
2340   ins_encode %{
2341     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2342   %}
2343   ins_pipe(pipe_slow);
2344 %}
2345 
2346 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2347   predicate(UseAVX > 0);
2348   match(Set dst (SubD src con));
2349 
2350   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2351   ins_cost(150);
2352   ins_encode %{
2353     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2354   %}
2355   ins_pipe(pipe_slow);
2356 %}
2357 
2358 instruct mulF_reg(regF dst, regF src) %{
2359   predicate((UseSSE>=1) && (UseAVX == 0));
2360   match(Set dst (MulF dst src));
2361 
2362   format %{ "mulss   $dst, $src" %}
2363   ins_cost(150);
2364   ins_encode %{
2365     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2366   %}
2367   ins_pipe(pipe_slow);
2368 %}
2369 
2370 instruct mulF_mem(regF dst, memory src) %{
2371   predicate((UseSSE>=1) && (UseAVX == 0));
2372   match(Set dst (MulF dst (LoadF src)));
2373 
2374   format %{ "mulss   $dst, $src" %}
2375   ins_cost(150);
2376   ins_encode %{
2377     __ mulss($dst$$XMMRegister, $src$$Address);
2378   %}
2379   ins_pipe(pipe_slow);
2380 %}
2381 
2382 instruct mulF_imm(regF dst, immF con) %{
2383   predicate((UseSSE>=1) && (UseAVX == 0));
2384   match(Set dst (MulF dst con));
2385   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2386   ins_cost(150);
2387   ins_encode %{
2388     __ mulss($dst$$XMMRegister, $constantaddress($con));
2389   %}
2390   ins_pipe(pipe_slow);
2391 %}
2392 
2393 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2394   predicate(UseAVX > 0);
2395   match(Set dst (MulF src1 src2));
2396 
2397   format %{ "vmulss  $dst, $src1, $src2" %}
2398   ins_cost(150);
2399   ins_encode %{
2400     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2401   %}
2402   ins_pipe(pipe_slow);
2403 %}
2404 
2405 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2406   predicate(UseAVX > 0);
2407   match(Set dst (MulF src1 (LoadF src2)));
2408 
2409   format %{ "vmulss  $dst, $src1, $src2" %}
2410   ins_cost(150);
2411   ins_encode %{
2412     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2413   %}
2414   ins_pipe(pipe_slow);
2415 %}
2416 
2417 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2418   predicate(UseAVX > 0);
2419   match(Set dst (MulF src con));
2420 
2421   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2422   ins_cost(150);
2423   ins_encode %{
2424     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2425   %}
2426   ins_pipe(pipe_slow);
2427 %}
2428 
2429 instruct mulD_reg(regD dst, regD src) %{
2430   predicate((UseSSE>=2) && (UseAVX == 0));
2431   match(Set dst (MulD dst src));
2432 
2433   format %{ "mulsd   $dst, $src" %}
2434   ins_cost(150);
2435   ins_encode %{
2436     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2437   %}
2438   ins_pipe(pipe_slow);
2439 %}
2440 
2441 instruct mulD_mem(regD dst, memory src) %{
2442   predicate((UseSSE>=2) && (UseAVX == 0));
2443   match(Set dst (MulD dst (LoadD src)));
2444 
2445   format %{ "mulsd   $dst, $src" %}
2446   ins_cost(150);
2447   ins_encode %{
2448     __ mulsd($dst$$XMMRegister, $src$$Address);
2449   %}
2450   ins_pipe(pipe_slow);
2451 %}
2452 
2453 instruct mulD_imm(regD dst, immD con) %{
2454   predicate((UseSSE>=2) && (UseAVX == 0));
2455   match(Set dst (MulD dst con));
2456   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2457   ins_cost(150);
2458   ins_encode %{
2459     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2460   %}
2461   ins_pipe(pipe_slow);
2462 %}
2463 
2464 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2465   predicate(UseAVX > 0);
2466   match(Set dst (MulD src1 src2));
2467 
2468   format %{ "vmulsd  $dst, $src1, $src2" %}
2469   ins_cost(150);
2470   ins_encode %{
2471     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2472   %}
2473   ins_pipe(pipe_slow);
2474 %}
2475 
2476 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2477   predicate(UseAVX > 0);
2478   match(Set dst (MulD src1 (LoadD src2)));
2479 
2480   format %{ "vmulsd  $dst, $src1, $src2" %}
2481   ins_cost(150);
2482   ins_encode %{
2483     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2484   %}
2485   ins_pipe(pipe_slow);
2486 %}
2487 
2488 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2489   predicate(UseAVX > 0);
2490   match(Set dst (MulD src con));
2491 
2492   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2493   ins_cost(150);
2494   ins_encode %{
2495     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2496   %}
2497   ins_pipe(pipe_slow);
2498 %}
2499 
2500 instruct divF_reg(regF dst, regF src) %{
2501   predicate((UseSSE>=1) && (UseAVX == 0));
2502   match(Set dst (DivF dst src));
2503 
2504   format %{ "divss   $dst, $src" %}
2505   ins_cost(150);
2506   ins_encode %{
2507     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2508   %}
2509   ins_pipe(pipe_slow);
2510 %}
2511 
2512 instruct divF_mem(regF dst, memory src) %{
2513   predicate((UseSSE>=1) && (UseAVX == 0));
2514   match(Set dst (DivF dst (LoadF src)));
2515 
2516   format %{ "divss   $dst, $src" %}
2517   ins_cost(150);
2518   ins_encode %{
2519     __ divss($dst$$XMMRegister, $src$$Address);
2520   %}
2521   ins_pipe(pipe_slow);
2522 %}
2523 
2524 instruct divF_imm(regF dst, immF con) %{
2525   predicate((UseSSE>=1) && (UseAVX == 0));
2526   match(Set dst (DivF dst con));
2527   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2528   ins_cost(150);
2529   ins_encode %{
2530     __ divss($dst$$XMMRegister, $constantaddress($con));
2531   %}
2532   ins_pipe(pipe_slow);
2533 %}
2534 
2535 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2536   predicate(UseAVX > 0);
2537   match(Set dst (DivF src1 src2));
2538 
2539   format %{ "vdivss  $dst, $src1, $src2" %}
2540   ins_cost(150);
2541   ins_encode %{
2542     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2543   %}
2544   ins_pipe(pipe_slow);
2545 %}
2546 
2547 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2548   predicate(UseAVX > 0);
2549   match(Set dst (DivF src1 (LoadF src2)));
2550 
2551   format %{ "vdivss  $dst, $src1, $src2" %}
2552   ins_cost(150);
2553   ins_encode %{
2554     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2555   %}
2556   ins_pipe(pipe_slow);
2557 %}
2558 
2559 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2560   predicate(UseAVX > 0);
2561   match(Set dst (DivF src con));
2562 
2563   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2564   ins_cost(150);
2565   ins_encode %{
2566     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2567   %}
2568   ins_pipe(pipe_slow);
2569 %}
2570 
2571 instruct divD_reg(regD dst, regD src) %{
2572   predicate((UseSSE>=2) && (UseAVX == 0));
2573   match(Set dst (DivD dst src));
2574 
2575   format %{ "divsd   $dst, $src" %}
2576   ins_cost(150);
2577   ins_encode %{
2578     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2579   %}
2580   ins_pipe(pipe_slow);
2581 %}
2582 
2583 instruct divD_mem(regD dst, memory src) %{
2584   predicate((UseSSE>=2) && (UseAVX == 0));
2585   match(Set dst (DivD dst (LoadD src)));
2586 
2587   format %{ "divsd   $dst, $src" %}
2588   ins_cost(150);
2589   ins_encode %{
2590     __ divsd($dst$$XMMRegister, $src$$Address);
2591   %}
2592   ins_pipe(pipe_slow);
2593 %}
2594 
2595 instruct divD_imm(regD dst, immD con) %{
2596   predicate((UseSSE>=2) && (UseAVX == 0));
2597   match(Set dst (DivD dst con));
2598   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2599   ins_cost(150);
2600   ins_encode %{
2601     __ divsd($dst$$XMMRegister, $constantaddress($con));
2602   %}
2603   ins_pipe(pipe_slow);
2604 %}
2605 
2606 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2607   predicate(UseAVX > 0);
2608   match(Set dst (DivD src1 src2));
2609 
2610   format %{ "vdivsd  $dst, $src1, $src2" %}
2611   ins_cost(150);
2612   ins_encode %{
2613     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2614   %}
2615   ins_pipe(pipe_slow);
2616 %}
2617 
2618 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2619   predicate(UseAVX > 0);
2620   match(Set dst (DivD src1 (LoadD src2)));
2621 
2622   format %{ "vdivsd  $dst, $src1, $src2" %}
2623   ins_cost(150);
2624   ins_encode %{
2625     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2626   %}
2627   ins_pipe(pipe_slow);
2628 %}
2629 
2630 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2631   predicate(UseAVX > 0);
2632   match(Set dst (DivD src con));
2633 
2634   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2635   ins_cost(150);
2636   ins_encode %{
2637     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2638   %}
2639   ins_pipe(pipe_slow);
2640 %}
2641 
2642 instruct absF_reg(regF dst) %{
2643   predicate((UseSSE>=1) && (UseAVX == 0));
2644   match(Set dst (AbsF dst));
2645   ins_cost(150);
2646   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2647   ins_encode %{
2648     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2649   %}
2650   ins_pipe(pipe_slow);
2651 %}
2652 
2653 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2654   predicate(UseAVX > 0);
2655   match(Set dst (AbsF src));
2656   ins_cost(150);
2657   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2658   ins_encode %{
2659     int vector_len = 0;
2660     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2661               ExternalAddress(float_signmask()), vector_len);
2662   %}
2663   ins_pipe(pipe_slow);
2664 %}
2665 
2666 instruct absD_reg(regD dst) %{
2667   predicate((UseSSE>=2) && (UseAVX == 0));
2668   match(Set dst (AbsD dst));
2669   ins_cost(150);
2670   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2671             "# abs double by sign masking" %}
2672   ins_encode %{
2673     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2674   %}
2675   ins_pipe(pipe_slow);
2676 %}
2677 
2678 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2679   predicate(UseAVX > 0);
2680   match(Set dst (AbsD src));
2681   ins_cost(150);
2682   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2683             "# abs double by sign masking" %}
2684   ins_encode %{
2685     int vector_len = 0;
2686     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2687               ExternalAddress(double_signmask()), vector_len);
2688   %}
2689   ins_pipe(pipe_slow);
2690 %}
2691 
2692 instruct negF_reg(regF dst) %{
2693   predicate((UseSSE>=1) && (UseAVX == 0));
2694   match(Set dst (NegF dst));
2695   ins_cost(150);
2696   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2697   ins_encode %{
2698     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2699   %}
2700   ins_pipe(pipe_slow);
2701 %}
2702 
2703 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2704   predicate(UseAVX > 0);
2705   match(Set dst (NegF src));
2706   ins_cost(150);
2707   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2708   ins_encode %{
2709     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2710                  ExternalAddress(float_signflip()));
2711   %}
2712   ins_pipe(pipe_slow);
2713 %}
2714 
2715 instruct negD_reg(regD dst) %{
2716   predicate((UseSSE>=2) && (UseAVX == 0));
2717   match(Set dst (NegD dst));
2718   ins_cost(150);
2719   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2720             "# neg double by sign flipping" %}
2721   ins_encode %{
2722     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2723   %}
2724   ins_pipe(pipe_slow);
2725 %}
2726 
2727 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2728   predicate(UseAVX > 0);
2729   match(Set dst (NegD src));
2730   ins_cost(150);
2731   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2732             "# neg double by sign flipping" %}
2733   ins_encode %{
2734     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2735                  ExternalAddress(double_signflip()));
2736   %}
2737   ins_pipe(pipe_slow);
2738 %}
2739 
2740 instruct sqrtF_reg(regF dst, regF src) %{
2741   predicate(UseSSE>=1);
2742   match(Set dst (SqrtF src));
2743 
2744   format %{ "sqrtss  $dst, $src" %}
2745   ins_cost(150);
2746   ins_encode %{
2747     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2748   %}
2749   ins_pipe(pipe_slow);
2750 %}
2751 
2752 instruct sqrtF_mem(regF dst, memory src) %{
2753   predicate(UseSSE>=1);
2754   match(Set dst (SqrtF (LoadF src)));
2755 
2756   format %{ "sqrtss  $dst, $src" %}
2757   ins_cost(150);
2758   ins_encode %{
2759     __ sqrtss($dst$$XMMRegister, $src$$Address);
2760   %}
2761   ins_pipe(pipe_slow);
2762 %}
2763 
2764 instruct sqrtF_imm(regF dst, immF con) %{
2765   predicate(UseSSE>=1);
2766   match(Set dst (SqrtF con));
2767 
2768   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2769   ins_cost(150);
2770   ins_encode %{
2771     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2772   %}
2773   ins_pipe(pipe_slow);
2774 %}
2775 
2776 instruct sqrtD_reg(regD dst, regD src) %{
2777   predicate(UseSSE>=2);
2778   match(Set dst (SqrtD src));
2779 
2780   format %{ "sqrtsd  $dst, $src" %}
2781   ins_cost(150);
2782   ins_encode %{
2783     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2784   %}
2785   ins_pipe(pipe_slow);
2786 %}
2787 
2788 instruct sqrtD_mem(regD dst, memory src) %{
2789   predicate(UseSSE>=2);
2790   match(Set dst (SqrtD (LoadD src)));
2791 
2792   format %{ "sqrtsd  $dst, $src" %}
2793   ins_cost(150);
2794   ins_encode %{
2795     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2796   %}
2797   ins_pipe(pipe_slow);
2798 %}
2799 
2800 instruct sqrtD_imm(regD dst, immD con) %{
2801   predicate(UseSSE>=2);
2802   match(Set dst (SqrtD con));
2803   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2804   ins_cost(150);
2805   ins_encode %{
2806     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2807   %}
2808   ins_pipe(pipe_slow);
2809 %}
2810 
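// Thread.onSpinWait() intrinsic: 'pause' hints that this is a spin-wait loop,
// reducing the memory-order-violation penalty on loop exit and yielding
// resources to a sibling hyper-thread.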
2811 instruct onspinwait() %{
2812   match(OnSpinWait);
2813   ins_cost(200);
2814 
2815   format %{
2816     $$template
2817     $$emit$$"pause\t! membar_onspinwait"
2818   %}
2819   ins_encode %{
2820     __ pause();
2821   %}
2822   ins_pipe(pipe_slow);
2823 %}
2824 
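// FMA: FmaD/FmaF compute a * b + c with a single rounding step (Math.fma
// semantics), which is why they are matched only when UseFMA is enabled.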
2825 // a * b + c
2826 instruct fmaD_reg(regD a, regD b, regD c) %{
2827   predicate(UseFMA);
2828   match(Set c (FmaD  c (Binary a b)));
2829   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2830   ins_cost(150);
2831   ins_encode %{
2832     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2833   %}
2834   ins_pipe( pipe_slow );
2835 %}
2836 
2837 // a * b + c
2838 instruct fmaF_reg(regF a, regF b, regF c) %{
2839   predicate(UseFMA);
2840   match(Set c (FmaF  c (Binary a b)));
2841   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2842   ins_cost(150);
2843   ins_encode %{
2844     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2845   %}
2846   ins_pipe( pipe_slow );
2847 %}
2848 
2849 // The following pseudo code describes the algorithm for max[FD]
2850 // (the min algorithm works along the same lines):
2851 //  btmp = (b < 0) ? a : b
2852 //  atmp = (b < 0) ? b : a
2853 //  Tmp  = Max_Float(atmp, btmp)
2854 //  Res  = (atmp is NaN) ? atmp : Tmp
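// The sign-based blends order the inputs so that vmaxps/vminps resolve the
// +0.0/-0.0 case the way Math.max/min require, and the unordered compare
// (predicate 0x3) detects a NaN in atmp so that NaN is propagated to the result.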
2855 instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
2856   predicate(UseAVX > 0);
2857   match(Set dst (MaxF a b));
2858   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
2859   format %{ 
2860      "blendvps         $btmp,$b,$a,$b           \n\t"
2861      "blendvps         $atmp,$a,$b,$b           \n\t"
2862      "vmaxps           $tmp,$atmp,$btmp         \n\t"
2863      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
2864      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
2865   %}
2866   ins_encode %{
2867     int vector_len = 0;
2868     __ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
2869     __ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
2870     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
2871     __ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, 0x3, vector_len);
2872     __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
2873  %}
2874   ins_pipe( pipe_slow );
2875 %}
2876 
2877 
2878 // max = java.lang.Math.max(double a, double b)
2879 instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
2880   predicate(UseAVX > 0);
2881   match(Set dst (MaxD a b));
2882   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
2883   format %{ 
2884      "blendvpd         $btmp,$b,$a,$b            \n\t"
2885      "blendvpd         $atmp,$a,$b,$b            \n\t"
2886      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
2887      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t" 
2888      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
2889   %}
2890   ins_encode %{
2891     int vector_len = 0;
2892     __ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
2893     __ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
2894     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
2895     __ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, 0x3, vector_len);
2896     __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
2897   %}
2898   ins_pipe( pipe_slow );
2899 %}
2900 
2901 
2902 // min = java.lang.Math.min(float a, float b)
2903 instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
2904   predicate(UseAVX > 0);
2905   match(Set dst (MinF a b));
2906   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
2907   format %{ 
2908      "blendvps         $atmp,$a,$b,$a             \n\t"
2909      "blendvps         $btmp,$b,$a,$a             \n\t"
2910      "vminps           $tmp,$atmp,$btmp           \n\t"
2911      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t" 
2912      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
2913   %}
2914   ins_encode %{
2915     int vector_len = 0;
2916     __ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
2917     __ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
2918     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
2919     __ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, 0x3, vector_len);
2920     __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
2921   %}
2922   ins_pipe( pipe_slow );
2923 %}
2924 
2925 // min = java.lang.Math.min(double a, double b)
2926 instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
2927   predicate(UseAVX > 0);
2928   match(Set dst (MinD a b));
2929   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
2930   format %{ 
2931      "blendvpd         $atmp,$a,$b,$a           \n\t"
2932      "blendvpd         $btmp,$b,$a,$a           \n\t"
2933      "vminpd           $tmp,$atmp,$btmp         \n\t"
2934      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t" 
2935      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
2936   %}
2937   ins_encode %{
2938     int vector_len = 0;
2939     __ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
2940     __ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
2941     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
2942     __ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, 0x3, vector_len);
2943     __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
2944   %}
2945   ins_pipe( pipe_slow );
2946 %}
2947 
2948 // ====================VECTOR INSTRUCTIONS=====================================
2949 
2950 
2951 // Load vectors (4 bytes long)
2952 instruct loadV4(vecS dst, memory mem) %{
2953   predicate(n->as_LoadVector()->memory_size() == 4);
2954   match(Set dst (LoadVector mem));
2955   ins_cost(125);
2956   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2957   ins_encode %{
2958     __ movdl($dst$$XMMRegister, $mem$$Address);
2959   %}
2960   ins_pipe( pipe_slow );
2961 %}
2962 
2963 // Move vectors (4 bytes long)
2964 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2965   match(Set dst src);
2966   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
2967   ins_encode %{
2968     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2969   %}
2970   ins_pipe( fpu_reg_reg );
2971 %}
2972 
2973 // Move vectors (4 bytes long)
2974 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2975   match(Set dst src);
2976   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
2977   ins_encode %{
2978     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2979   %}
2980   ins_pipe( fpu_reg_reg );
2981 %}
2982 
2983 // Load vectors (8 bytes long)
2984 instruct loadV8(vecD dst, memory mem) %{
2985   predicate(n->as_LoadVector()->memory_size() == 8);
2986   match(Set dst (LoadVector mem));
2987   ins_cost(125);
2988   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2989   ins_encode %{
2990     __ movq($dst$$XMMRegister, $mem$$Address);
2991   %}
2992   ins_pipe( pipe_slow );
2993 %}
2994 
2995 // Move vectors (8 bytes long)
2996 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
2997   match(Set dst src);
2998   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
2999   ins_encode %{
3000     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3001   %}
3002   ins_pipe( fpu_reg_reg );
3003 %}
3004 
3005 // Move vectors (8 bytes long)
3006 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
3007   match(Set dst src);
3008   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3009   ins_encode %{
3010     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3011   %}
3012   ins_pipe( fpu_reg_reg );
3013 %}
3014 
3015 // Load vectors (16 bytes long)
3016 instruct loadV16(vecX dst, memory mem) %{
3017   predicate(n->as_LoadVector()->memory_size() == 16);
3018   match(Set dst (LoadVector mem));
3019   ins_cost(125);
3020   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3021   ins_encode %{
3022     __ movdqu($dst$$XMMRegister, $mem$$Address);
3023   %}
3024   ins_pipe( pipe_slow );
3025 %}
3026 
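// Moves between the 'leg' (legacy-encodable XMM0-XMM15) and full vector register
// classes: without AVX512VL, registers XMM16-XMM31 are only reachable through
// 512-bit EVEX encodings, hence the full-width evmovdquq in that case.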
3027 // Move vectors (16 bytes long)
3028 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
3029   match(Set dst src);
3030   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3031   ins_encode %{
3032     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3033       int vector_len = 2;
3034       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3035     } else {
3036       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3037     }
3038   %}
3039   ins_pipe( fpu_reg_reg );
3040 %}
3041 
3042 // Move vectors (16 bytes long)
3043 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
3044   match(Set dst src);
3045   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3046   ins_encode %{
3047     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3048       int vector_len = 2;
3049       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3050     } else {
3051       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3052     }
3053   %}
3054   ins_pipe( fpu_reg_reg );
3055 %}
3056 
3057 // Load vectors (32 bytes long)
3058 instruct loadV32(vecY dst, memory mem) %{
3059   predicate(n->as_LoadVector()->memory_size() == 32);
3060   match(Set dst (LoadVector mem));
3061   ins_cost(125);
3062   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3063   ins_encode %{
3064     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3065   %}
3066   ins_pipe( pipe_slow );
3067 %}
3068 
3069 // Move vectors (32 bytes long)
3070 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3071   match(Set dst src);
3072   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3073   ins_encode %{
3074     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3075       int vector_len = 2;
3076       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3077     } else {
3078       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3079     }
3080   %}
3081   ins_pipe( fpu_reg_reg );
3082 %}
3083 
3084 // Move vectors (32 bytes long)
3085 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3086   match(Set dst src);
3087   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3088   ins_encode %{
3089     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3090       int vector_len = 2;
3091       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3092     } else {
3093       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3094     }
3095   %}
3096   ins_pipe( fpu_reg_reg );
3097 %}
3098 
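// 64-byte vector loads use element-granular EVEX moves: evmovdqul when the
// elements are at most 4 bytes wide, evmovdquq otherwise ('k0' in the format
// denotes an unmasked EVEX operation).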
3099 // Load vectors (64 bytes long)
3100 instruct loadV64_dword(vecZ dst, memory mem) %{
3101   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3102   match(Set dst (LoadVector mem));
3103   ins_cost(125);
3104   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3105   ins_encode %{
3106     int vector_len = 2;
3107     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3108   %}
3109   ins_pipe( pipe_slow );
3110 %}
3111 
3112 // Load vectors (64 bytes long)
3113 instruct loadV64_qword(vecZ dst, memory mem) %{
3114   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3115   match(Set dst (LoadVector mem));
3116   ins_cost(125);
3117   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3118   ins_encode %{
3119     int vector_len = 2;
3120     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3121   %}
3122   ins_pipe( pipe_slow );
3123 %}
3124 
3125 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3126   match(Set dst src);
3127   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3128   ins_encode %{
3129     int vector_len = 2;
3130     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3131   %}
3132   ins_pipe( fpu_reg_reg );
3133 %}
3134 
3135 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3136   match(Set dst src);
3137   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3138   ins_encode %{
3139     int vector_len = 2;
3140     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3141   %}
3142   ins_pipe( fpu_reg_reg );
3143 %}
3144 
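// Vector stores mirror the loads above: movd/movq/movdqu/vmovdqu up to
// 32 bytes, and element-granular EVEX moves for 64-byte vectors.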
3145 // Store vectors
3146 instruct storeV4(memory mem, vecS src) %{
3147   predicate(n->as_StoreVector()->memory_size() == 4);
3148   match(Set mem (StoreVector mem src));
3149   ins_cost(145);
3150   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3151   ins_encode %{
3152     __ movdl($mem$$Address, $src$$XMMRegister);
3153   %}
3154   ins_pipe( pipe_slow );
3155 %}
3156 
3157 instruct storeV8(memory mem, vecD src) %{
3158   predicate(n->as_StoreVector()->memory_size() == 8);
3159   match(Set mem (StoreVector mem src));
3160   ins_cost(145);
3161   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3162   ins_encode %{
3163     __ movq($mem$$Address, $src$$XMMRegister);
3164   %}
3165   ins_pipe( pipe_slow );
3166 %}
3167 
3168 instruct storeV16(memory mem, vecX src) %{
3169   predicate(n->as_StoreVector()->memory_size() == 16);
3170   match(Set mem (StoreVector mem src));
3171   ins_cost(145);
3172   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3173   ins_encode %{
3174     __ movdqu($mem$$Address, $src$$XMMRegister);
3175   %}
3176   ins_pipe( pipe_slow );
3177 %}
3178 
3179 instruct storeV32(memory mem, vecY src) %{
3180   predicate(n->as_StoreVector()->memory_size() == 32);
3181   match(Set mem (StoreVector mem src));
3182   ins_cost(145);
3183   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3184   ins_encode %{
3185     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3186   %}
3187   ins_pipe( pipe_slow );
3188 %}
3189 
3190 instruct storeV64_dword(memory mem, vecZ src) %{
3191   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3192   match(Set mem (StoreVector mem src));
3193   ins_cost(145);
3194   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3195   ins_encode %{
3196     int vector_len = 2;
3197     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3198   %}
3199   ins_pipe( pipe_slow );
3200 %}
3201 
3202 instruct storeV64_qword(memory mem, vecZ src) %{
3203   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3204   match(Set mem (StoreVector mem src));
3205   ins_cost(145);
3206   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3207   ins_encode %{
3208     int vector_len = 2;
3209     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3210   %}
3211   ins_pipe( pipe_slow );
3212 %}
3213 
3214 // ====================LEGACY REPLICATE=======================================
3215 
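// Scalar replication on targets without AVX512VL/VLBW builds the vector up in
// stages: movd/movdq into the low lanes, punpck*/pshuf* to fill the low 8 or
// 16 bytes, then vinserti128_high and vinserti64x4 to copy the low half into
// the upper 128/256 bits of 32- and 64-byte vectors.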
3216 instruct Repl4B_mem(vecS dst, memory mem) %{
3217   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3218   match(Set dst (ReplicateB (LoadB mem)));
3219   format %{ "punpcklbw $dst,$mem\n\t"
3220             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3221   ins_encode %{
3222     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3223     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3224   %}
3225   ins_pipe( pipe_slow );
3226 %}
3227 
3228 instruct Repl8B_mem(vecD dst, memory mem) %{
3229   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3230   match(Set dst (ReplicateB (LoadB mem)));
3231   format %{ "punpcklbw $dst,$mem\n\t"
3232             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3233   ins_encode %{
3234     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3235     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3236   %}
3237   ins_pipe( pipe_slow );
3238 %}
3239 
3240 instruct Repl16B(vecX dst, rRegI src) %{
3241   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3242   match(Set dst (ReplicateB src));
3243   format %{ "movd    $dst,$src\n\t"
3244             "punpcklbw $dst,$dst\n\t"
3245             "pshuflw $dst,$dst,0x00\n\t"
3246             "punpcklqdq $dst,$dst\t! replicate16B" %}
3247   ins_encode %{
3248     __ movdl($dst$$XMMRegister, $src$$Register);
3249     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3250     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3251     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3252   %}
3253   ins_pipe( pipe_slow );
3254 %}
3255 
3256 instruct Repl16B_mem(vecX dst, memory mem) %{
3257   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3258   match(Set dst (ReplicateB (LoadB mem)));
3259   format %{ "punpcklbw $dst,$mem\n\t"
3260             "pshuflw $dst,$dst,0x00\n\t"
3261             "punpcklqdq $dst,$dst\t! replicate16B" %}
3262   ins_encode %{
3263     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3264     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3265     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3266   %}
3267   ins_pipe( pipe_slow );
3268 %}
3269 
3270 instruct Repl32B(vecY dst, rRegI src) %{
3271   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3272   match(Set dst (ReplicateB src));
3273   format %{ "movd    $dst,$src\n\t"
3274             "punpcklbw $dst,$dst\n\t"
3275             "pshuflw $dst,$dst,0x00\n\t"
3276             "punpcklqdq $dst,$dst\n\t"
3277             "vinserti128_high $dst,$dst\t! replicate32B" %}
3278   ins_encode %{
3279     __ movdl($dst$$XMMRegister, $src$$Register);
3280     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3281     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3282     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3283     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3284   %}
3285   ins_pipe( pipe_slow );
3286 %}
3287 
3288 instruct Repl32B_mem(vecY dst, memory mem) %{
3289   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3290   match(Set dst (ReplicateB (LoadB mem)));
3291   format %{ "punpcklbw $dst,$mem\n\t"
3292             "pshuflw $dst,$dst,0x00\n\t"
3293             "punpcklqdq $dst,$dst\n\t"
3294             "vinserti128_high $dst,$dst\t! replicate32B" %}
3295   ins_encode %{
3296     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3297     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3298     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3299     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3300   %}
3301   ins_pipe( pipe_slow );
3302 %}
3303 
3304 instruct Repl64B(legVecZ dst, rRegI src) %{
3305   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3306   match(Set dst (ReplicateB src));
3307   format %{ "movd    $dst,$src\n\t"
3308             "punpcklbw $dst,$dst\n\t"
3309             "pshuflw $dst,$dst,0x00\n\t"
3310             "punpcklqdq $dst,$dst\n\t"
3311             "vinserti128_high $dst,$dst\t"
3312             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3313   ins_encode %{
3314     __ movdl($dst$$XMMRegister, $src$$Register);
3315     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3316     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3317     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3318     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3319     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3320   %}
3321   ins_pipe( pipe_slow );
3322 %}
3323 
3324 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3325   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3326   match(Set dst (ReplicateB (LoadB mem)));
3327   format %{ "punpcklbw $dst,$mem\n\t"
3328             "pshuflw $dst,$dst,0x00\n\t"
3329             "punpcklqdq $dst,$dst\n\t"
3330             "vinserti128_high $dst,$dst\t"
3331             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3332   ins_encode %{
3333     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3334     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3335     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3336     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3337     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3338   %}
3339   ins_pipe( pipe_slow );
3340 %}
3341 
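// For the _imm forms, the byte/short/int immediate is pre-replicated into a
// 64-bit constant-table entry by replicate8_imm(con, element_width), so the
// generated code only has to broadcast that one quadword.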
3342 instruct Repl16B_imm(vecX dst, immI con) %{
3343   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3344   match(Set dst (ReplicateB con));
3345   format %{ "movq    $dst,[$constantaddress]\n\t"
3346             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3347   ins_encode %{
3348     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3349     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3350   %}
3351   ins_pipe( pipe_slow );
3352 %}
3353 
3354 instruct Repl32B_imm(vecY dst, immI con) %{
3355   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3356   match(Set dst (ReplicateB con));
3357   format %{ "movq    $dst,[$constantaddress]\n\t"
3358             "punpcklqdq $dst,$dst\n\t"
3359             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3360   ins_encode %{
3361     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3362     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3363     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3364   %}
3365   ins_pipe( pipe_slow );
3366 %}
3367 
3368 instruct Repl64B_imm(legVecZ dst, immI con) %{
3369   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3370   match(Set dst (ReplicateB con));
3371   format %{ "movq    $dst,[$constantaddress]\n\t"
3372             "punpcklqdq $dst,$dst\n\t"
3373             "vinserti128_high $dst,$dst\t"
3374             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3375   ins_encode %{
3376     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3377     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3378     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3379     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3380   %}
3381   ins_pipe( pipe_slow );
3382 %}
3383 
3384 instruct Repl4S(vecD dst, rRegI src) %{
3385   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3386   match(Set dst (ReplicateS src));
3387   format %{ "movd    $dst,$src\n\t"
3388             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3389   ins_encode %{
3390     __ movdl($dst$$XMMRegister, $src$$Register);
3391     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3392   %}
3393   ins_pipe( pipe_slow );
3394 %}
3395 
3396 instruct Repl4S_mem(vecD dst, memory mem) %{
3397   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3398   match(Set dst (ReplicateS (LoadS mem)));
3399   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3400   ins_encode %{
3401     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3402   %}
3403   ins_pipe( pipe_slow );
3404 %}
3405 
3406 instruct Repl8S(vecX dst, rRegI src) %{
3407   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3408   match(Set dst (ReplicateS src));
3409   format %{ "movd    $dst,$src\n\t"
3410             "pshuflw $dst,$dst,0x00\n\t"
3411             "punpcklqdq $dst,$dst\t! replicate8S" %}
3412   ins_encode %{
3413     __ movdl($dst$$XMMRegister, $src$$Register);
3414     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3415     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3416   %}
3417   ins_pipe( pipe_slow );
3418 %}
3419 
3420 instruct Repl8S_mem(vecX dst, memory mem) %{
3421   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3422   match(Set dst (ReplicateS (LoadS mem)));
3423   format %{ "pshuflw $dst,$mem,0x00\n\t"
3424             "punpcklqdq $dst,$dst\t! replicate8S" %}
3425   ins_encode %{
3426     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3427     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3428   %}
3429   ins_pipe( pipe_slow );
3430 %}
3431 
3432 instruct Repl8S_imm(vecX dst, immI con) %{
3433   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3434   match(Set dst (ReplicateS con));
3435   format %{ "movq    $dst,[$constantaddress]\n\t"
3436             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3437   ins_encode %{
3438     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3439     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3440   %}
3441   ins_pipe( pipe_slow );
3442 %}
3443 
3444 instruct Repl16S(vecY dst, rRegI src) %{
3445   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3446   match(Set dst (ReplicateS src));
3447   format %{ "movd    $dst,$src\n\t"
3448             "pshuflw $dst,$dst,0x00\n\t"
3449             "punpcklqdq $dst,$dst\n\t"
3450             "vinserti128_high $dst,$dst\t! replicate16S" %}
3451   ins_encode %{
3452     __ movdl($dst$$XMMRegister, $src$$Register);
3453     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3454     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3455     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3456   %}
3457   ins_pipe( pipe_slow );
3458 %}
3459 
3460 instruct Repl16S_mem(vecY dst, memory mem) %{
3461   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3462   match(Set dst (ReplicateS (LoadS mem)));
3463   format %{ "pshuflw $dst,$mem,0x00\n\t"
3464             "punpcklqdq $dst,$dst\n\t"
3465             "vinserti128_high $dst,$dst\t! replicate16S" %}
3466   ins_encode %{
3467     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3468     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3469     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3470   %}
3471   ins_pipe( pipe_slow );
3472 %}
3473 
3474 instruct Repl16S_imm(vecY dst, immI con) %{
3475   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3476   match(Set dst (ReplicateS con));
3477   format %{ "movq    $dst,[$constantaddress]\n\t"
3478             "punpcklqdq $dst,$dst\n\t"
3479             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3480   ins_encode %{
3481     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3482     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3483     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3484   %}
3485   ins_pipe( pipe_slow );
3486 %}
3487 
3488 instruct Repl32S(legVecZ dst, rRegI src) %{
3489   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3490   match(Set dst (ReplicateS src));
3491   format %{ "movd    $dst,$src\n\t"
3492             "pshuflw $dst,$dst,0x00\n\t"
3493             "punpcklqdq $dst,$dst\n\t"
3494             "vinserti128_high $dst,$dst\t"
3495             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3496   ins_encode %{
3497     __ movdl($dst$$XMMRegister, $src$$Register);
3498     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3499     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3500     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3501     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3502   %}
3503   ins_pipe( pipe_slow );
3504 %}
3505 
3506 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3507   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3508   match(Set dst (ReplicateS (LoadS mem)));
3509   format %{ "pshuflw $dst,$mem,0x00\n\t"
3510             "punpcklqdq $dst,$dst\n\t"
3511             "vinserti128_high $dst,$dst\t"
3512             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3513   ins_encode %{
3514     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3515     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3516     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3517     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
3522 instruct Repl32S_imm(legVecZ dst, immI con) %{
3523   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3524   match(Set dst (ReplicateS con));
3525   format %{ "movq    $dst,[$constantaddress]\n\t"
3526             "punpcklqdq $dst,$dst\n\t"
3527             "vinserti128_high $dst,$dst\t"
3528             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3529   ins_encode %{
3530     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3531     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3532     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3533     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3534   %}
3535   ins_pipe( pipe_slow );
3536 %}
3537 
3538 instruct Repl4I(vecX dst, rRegI src) %{
3539   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3540   match(Set dst (ReplicateI src));
3541   format %{ "movd    $dst,$src\n\t"
3542             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3543   ins_encode %{
3544     __ movdl($dst$$XMMRegister, $src$$Register);
3545     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3546   %}
3547   ins_pipe( pipe_slow );
3548 %}
3549 
3550 instruct Repl4I_mem(vecX dst, memory mem) %{
3551   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3552   match(Set dst (ReplicateI (LoadI mem)));
3553   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3554   ins_encode %{
3555     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3556   %}
3557   ins_pipe( pipe_slow );
3558 %}
3559 
3560 instruct Repl8I(vecY dst, rRegI src) %{
3561   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3562   match(Set dst (ReplicateI src));
3563   format %{ "movd    $dst,$src\n\t"
3564             "pshufd  $dst,$dst,0x00\n\t"
3565             "vinserti128_high $dst,$dst\t! replicate8I" %}
3566   ins_encode %{
3567     __ movdl($dst$$XMMRegister, $src$$Register);
3568     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3569     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3570   %}
3571   ins_pipe( pipe_slow );
3572 %}
3573 
3574 instruct Repl8I_mem(vecY dst, memory mem) %{
3575   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3576   match(Set dst (ReplicateI (LoadI mem)));
3577   format %{ "pshufd  $dst,$mem,0x00\n\t"
3578             "vinserti128_high $dst,$dst\t! replicate8I" %}
3579   ins_encode %{
3580     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3581     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3582   %}
3583   ins_pipe( pipe_slow );
3584 %}
3585 
3586 instruct Repl16I(legVecZ dst, rRegI src) %{
3587   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3588   match(Set dst (ReplicateI src));
3589   format %{ "movd    $dst,$src\n\t"
3590             "pshufd  $dst,$dst,0x00\n\t"
3591             "vinserti128_high $dst,$dst\t"
3592             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3593   ins_encode %{
3594     __ movdl($dst$$XMMRegister, $src$$Register);
3595     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3596     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3597     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3598   %}
3599   ins_pipe( pipe_slow );
3600 %}
3601 
3602 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3603   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3604   match(Set dst (ReplicateI (LoadI mem)));
3605   format %{ "pshufd  $dst,$mem,0x00\n\t"
3606             "vinserti128_high $dst,$dst\t"
3607             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3608   ins_encode %{
3609     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3610     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3611     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3612   %}
3613   ins_pipe( pipe_slow );
3614 %}
3615 
3616 instruct Repl4I_imm(vecX dst, immI con) %{
3617   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3618   match(Set dst (ReplicateI con));
3619   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3620             "punpcklqdq $dst,$dst" %}
3621   ins_encode %{
3622     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3623     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3624   %}
3625   ins_pipe( pipe_slow );
3626 %}
3627 
3628 instruct Repl8I_imm(vecY dst, immI con) %{
3629   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3630   match(Set dst (ReplicateI con));
3631   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3632             "punpcklqdq $dst,$dst\n\t"
3633             "vinserti128_high $dst,$dst" %}
3634   ins_encode %{
3635     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3636     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3637     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3638   %}
3639   ins_pipe( pipe_slow );
3640 %}
3641 
3642 instruct Repl16I_imm(legVecZ dst, immI con) %{
3643   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3644   match(Set dst (ReplicateI con));
3645   format %{ "movq    $dst,[$constantaddress]\n\t"
3646             "punpcklqdq $dst,$dst\n\t"
3647             "vinserti128_high $dst,$dst\t"
3648             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3649   ins_encode %{
3650     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3651     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3652     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3653     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3654   %}
3655   ins_pipe( pipe_slow );
3656 %}
3657 
3658 // A long can be loaded into an XMM register directly from memory.
3659 instruct Repl2L_mem(vecX dst, memory mem) %{
3660   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3661   match(Set dst (ReplicateL (LoadL mem)));
3662   format %{ "movq    $dst,$mem\n\t"
3663             "punpcklqdq $dst,$dst\t! replicate2L" %}
3664   ins_encode %{
3665     __ movq($dst$$XMMRegister, $mem$$Address);
3666     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3667   %}
3668   ins_pipe( pipe_slow );
3669 %}
3670 
3671 // Replicate long (8 byte) scalar to a vector
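// (On LP64 the 64-bit GPR is moved straight into the XMM register with movdq;
// on 32-bit the value is assembled from $src.lo and $src.hi with two movdl
// transfers and a punpckldq.)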
3672 #ifdef _LP64
3673 instruct Repl4L(vecY dst, rRegL src) %{
3674   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3675   match(Set dst (ReplicateL src));
3676   format %{ "movdq   $dst,$src\n\t"
3677             "punpcklqdq $dst,$dst\n\t"
3678             "vinserti128_high $dst,$dst\t! replicate4L" %}
3679   ins_encode %{
3680     __ movdq($dst$$XMMRegister, $src$$Register);
3681     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3682     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3683   %}
3684   ins_pipe( pipe_slow );
3685 %}
3686 
3687 instruct Repl8L(legVecZ dst, rRegL src) %{
3688   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3689   match(Set dst (ReplicateL src));
3690   format %{ "movdq   $dst,$src\n\t"
3691             "punpcklqdq $dst,$dst\n\t"
3692             "vinserti128_high $dst,$dst\n\t"
3693             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3694   ins_encode %{
3695     __ movdq($dst$$XMMRegister, $src$$Register);
3696     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3697     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3698     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3699   %}
3700   ins_pipe( pipe_slow );
3701 %}
3702 #else // _LP64
3703 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3704   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3705   match(Set dst (ReplicateL src));
3706   effect(TEMP dst, USE src, TEMP tmp);
3707   format %{ "movdl   $dst,$src.lo\n\t"
3708             "movdl   $tmp,$src.hi\n\t"
3709             "punpckldq $dst,$tmp\n\t"
3710             "punpcklqdq $dst,$dst\n\t"
3711             "vinserti128_high $dst,$dst\t! replicate4L" %}
3712   ins_encode %{
3713     __ movdl($dst$$XMMRegister, $src$$Register);
3714     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3715     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3716     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3717     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3718   %}
3719   ins_pipe( pipe_slow );
3720 %}
3721 
3722 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3723   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3724   match(Set dst (ReplicateL src));
3725   effect(TEMP dst, USE src, TEMP tmp);
3726   format %{ "movdl   $dst,$src.lo\n\t"
3727             "movdl   $tmp,$src.hi\n\t"
3728             "punpckldq $dst,$tmp\n\t"
3729             "punpcklqdq $dst,$dst\n\t"
3730             "vinserti128_high $dst,$dst\n\t"
3731             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3732   ins_encode %{
3733     __ movdl($dst$$XMMRegister, $src$$Register);
3734     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3735     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3736     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3737     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3738     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3739   %}
3740   ins_pipe( pipe_slow );
3741 %}
3742 #endif // _LP64
3743 
3744 instruct Repl4L_imm(vecY dst, immL con) %{
3745   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3746   match(Set dst (ReplicateL con));
3747   format %{ "movq    $dst,[$constantaddress]\n\t"
3748             "punpcklqdq $dst,$dst\n\t"
3749             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3750   ins_encode %{
3751     __ movq($dst$$XMMRegister, $constantaddress($con));
3752     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3753     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3754   %}
3755   ins_pipe( pipe_slow );
3756 %}
3757 
3758 instruct Repl8L_imm(legVecZ dst, immL con) %{
3759   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3760   match(Set dst (ReplicateL con));
3761   format %{ "movq    $dst,[$constantaddress]\n\t"
3762             "punpcklqdq $dst,$dst\n\t"
3763             "vinserti128_high $dst,$dst\n\t"
3764             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3765   ins_encode %{
3766     __ movq($dst$$XMMRegister, $constantaddress($con));
3767     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3768     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3769     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3770   %}
3771   ins_pipe( pipe_slow );
3772 %}
3773 
3774 instruct Repl4L_mem(vecY dst, memory mem) %{
3775   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3776   match(Set dst (ReplicateL (LoadL mem)));
3777   format %{ "movq    $dst,$mem\n\t"
3778             "punpcklqdq $dst,$dst\n\t"
3779             "vinserti128_high $dst,$dst\t! replicate4L" %}
3780   ins_encode %{
3781     __ movq($dst$$XMMRegister, $mem$$Address);
3782     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3783     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3784   %}
3785   ins_pipe( pipe_slow );
3786 %}
3787 
3788 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3789   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3790   match(Set dst (ReplicateL (LoadL mem)));
3791   format %{ "movq    $dst,$mem\n\t"
3792             "punpcklqdq $dst,$dst\n\t"
3793             "vinserti128_high $dst,$dst\n\t"
3794             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3795   ins_encode %{
3796     __ movq($dst$$XMMRegister, $mem$$Address);
3797     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3798     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3799     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3800   %}
3801   ins_pipe( pipe_slow );
3802 %}
3803 
3804 instruct Repl2F_mem(vecD dst, memory mem) %{
3805   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3806   match(Set dst (ReplicateF (LoadF mem)));
3807   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3808   ins_encode %{
3809     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3810   %}
3811   ins_pipe( pipe_slow );
3812 %}
3813 
3814 instruct Repl4F_mem(vecX dst, memory mem) %{
3815   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3816   match(Set dst (ReplicateF (LoadF mem)));
3817   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3818   ins_encode %{
3819     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3820   %}
3821   ins_pipe( pipe_slow );
3822 %}
3823 
3824 instruct Repl8F(vecY dst, vlRegF src) %{
3825   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3826   match(Set dst (ReplicateF src));
3827   format %{ "pshufd  $dst,$src,0x00\n\t"
3828             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3829   ins_encode %{
3830     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3831     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3832   %}
3833   ins_pipe( pipe_slow );
3834 %}
3835 
3836 instruct Repl8F_mem(vecY dst, memory mem) %{
3837   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3838   match(Set dst (ReplicateF (LoadF mem)));
3839   format %{ "pshufd  $dst,$mem,0x00\n\t"
3840             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3841   ins_encode %{
3842     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3843     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3844   %}
3845   ins_pipe( pipe_slow );
3846 %}
3847 
3848 instruct Repl16F(legVecZ dst, vlRegF src) %{
3849   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3850   match(Set dst (ReplicateF src));
3851   format %{ "pshufd  $dst,$src,0x00\n\t"
3852             "vinsertf128_high $dst,$dst\n\t"
3853             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3854   ins_encode %{
3855     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3856     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3857     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3858   %}
3859   ins_pipe( pipe_slow );
3860 %}
3861 
3862 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3863   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3864   match(Set dst (ReplicateF (LoadF mem)));
3865   format %{ "pshufd  $dst,$mem,0x00\n\t"
3866             "vinsertf128_high $dst,$dst\n\t"
3867             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3868   ins_encode %{
3869     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3870     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3871     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3872   %}
3873   ins_pipe( pipe_slow );
3874 %}
3875 
3876 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3877   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3878   match(Set dst (ReplicateF zero));
3879   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3880   ins_encode %{
3881     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3882   %}
3883   ins_pipe( fpu_reg_reg );
3884 %}
3885 
3886 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3887   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3888   match(Set dst (ReplicateF zero));
3889   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3890   ins_encode %{
3891     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3892   %}
3893   ins_pipe( fpu_reg_reg );
3894 %}
3895 
3896 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3897   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3898   match(Set dst (ReplicateF zero));
3899   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3900   ins_encode %{
3901     int vector_len = 1;
3902     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3903   %}
3904   ins_pipe( fpu_reg_reg );
3905 %}
3906 
3907 instruct Repl2D_mem(vecX dst, memory mem) %{
3908   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3909   match(Set dst (ReplicateD (LoadD mem)));
3910   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3911   ins_encode %{
3912     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3913   %}
3914   ins_pipe( pipe_slow );
3915 %}
3916 
3917 instruct Repl4D(vecY dst, vlRegD src) %{
3918   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3919   match(Set dst (ReplicateD src));
3920   format %{ "pshufd  $dst,$src,0x44\n\t"
3921             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3922   ins_encode %{
3923     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3924     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3925   %}
3926   ins_pipe( pipe_slow );
3927 %}
3928 
3929 instruct Repl4D_mem(vecY dst, memory mem) %{
3930   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3931   match(Set dst (ReplicateD (LoadD mem)));
3932   format %{ "pshufd  $dst,$mem,0x44\n\t"
3933             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3934   ins_encode %{
3935     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3936     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3937   %}
3938   ins_pipe( pipe_slow );
3939 %}
3940 
3941 instruct Repl8D(legVecZ dst, vlRegD src) %{
3942   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3943   match(Set dst (ReplicateD src));
3944   format %{ "pshufd  $dst,$src,0x44\n\t"
3945             "vinsertf128_high $dst,$dst\n\t"
3946             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3947   ins_encode %{
3948     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3949     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3950     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3951   %}
3952   ins_pipe( pipe_slow );
3953 %}
3954 
3955 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3956   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3957   match(Set dst (ReplicateD (LoadD mem)));
3958   format %{ "pshufd  $dst,$mem,0x44\n\t"
3959             "vinsertf128_high $dst,$dst\n\t"
3960             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3961   ins_encode %{
3962     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3963     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3964     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3965   %}
3966   ins_pipe( pipe_slow );
3967 %}
3968 
3969 // Replicate double (8 byte) scalar zero to be vector
3970 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3971   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3972   match(Set dst (ReplicateD zero));
3973   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3974   ins_encode %{
3975     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3976   %}
3977   ins_pipe( fpu_reg_reg );
3978 %}
3979 
3980 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3981   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3982   match(Set dst (ReplicateD zero));
3983   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3984   ins_encode %{
3985     int vector_len = 1;
3986     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3987   %}
3988   ins_pipe( fpu_reg_reg );
3989 %}
3990 
3991 // ====================GENERIC REPLICATE==========================================
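     //
     // The rules in this section predicate only on vector length and build the
     // broadcast from plain SSE2 shuffles: punpcklbw $dst,$dst doubles each byte
     // into a word, pshuflw/pshufd with immediate 0x00 broadcast the low
     // word/dword, pshufd with 0x44 duplicates the low 64-bit lane, and
     // punpcklqdq $dst,$dst copies the low quadword into the high quadword.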
3992 
3993 // Replicate byte scalar to be vector
3994 instruct Repl4B(vecS dst, rRegI src) %{
3995   predicate(n->as_Vector()->length() == 4);
3996   match(Set dst (ReplicateB src));
3997   format %{ "movd    $dst,$src\n\t"
3998             "punpcklbw $dst,$dst\n\t"
3999             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
4000   ins_encode %{
4001     __ movdl($dst$$XMMRegister, $src$$Register);
4002     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4003     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4004   %}
4005   ins_pipe( pipe_slow );
4006 %}
4007 
4008 instruct Repl8B(vecD dst, rRegI src) %{
4009   predicate(n->as_Vector()->length() == 8);
4010   match(Set dst (ReplicateB src));
4011   format %{ "movd    $dst,$src\n\t"
4012             "punpcklbw $dst,$dst\n\t"
4013             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
4014   ins_encode %{
4015     __ movdl($dst$$XMMRegister, $src$$Register);
4016     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4017     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4018   %}
4019   ins_pipe( pipe_slow );
4020 %}
4021 
4022 // Replicate byte scalar immediate to be vector by loading from const table.
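     // The replicate4_imm/replicate8_imm helpers (defined elsewhere in the x86
     // AD sources) pack the immediate repeatedly at the given element width into
     // a 32-/64-bit pattern; that pattern is emitted into the constant table and
     // loaded with movdl/movq, so no shuffle is needed for these small vectors.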
4023 instruct Repl4B_imm(vecS dst, immI con) %{
4024   predicate(n->as_Vector()->length() == 4);
4025   match(Set dst (ReplicateB con));
4026   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
4027   ins_encode %{
4028     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
4029   %}
4030   ins_pipe( pipe_slow );
4031 %}
4032 
4033 instruct Repl8B_imm(vecD dst, immI con) %{
4034   predicate(n->as_Vector()->length() == 8);
4035   match(Set dst (ReplicateB con));
4036   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
4037   ins_encode %{
4038     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4039   %}
4040   ins_pipe( pipe_slow );
4041 %}
4042 
4043 // Replicate byte scalar zero to be vector
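     // pxor $dst,$dst (vpxor for the wider forms) is the usual zeroing idiom;
     // current CPUs typically recognize it and break the dependence on the
     // previous value of $dst.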
4044 instruct Repl4B_zero(vecS dst, immI0 zero) %{
4045   predicate(n->as_Vector()->length() == 4);
4046   match(Set dst (ReplicateB zero));
4047   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
4048   ins_encode %{
4049     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4050   %}
4051   ins_pipe( fpu_reg_reg );
4052 %}
4053 
4054 instruct Repl8B_zero(vecD dst, immI0 zero) %{
4055   predicate(n->as_Vector()->length() == 8);
4056   match(Set dst (ReplicateB zero));
4057   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
4058   ins_encode %{
4059     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4060   %}
4061   ins_pipe( fpu_reg_reg );
4062 %}
4063 
4064 instruct Repl16B_zero(vecX dst, immI0 zero) %{
4065   predicate(n->as_Vector()->length() == 16);
4066   match(Set dst (ReplicateB zero));
4067   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
4068   ins_encode %{
4069     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4070   %}
4071   ins_pipe( fpu_reg_reg );
4072 %}
4073 
4074 instruct Repl32B_zero(vecY dst, immI0 zero) %{
4075   predicate(n->as_Vector()->length() == 32);
4076   match(Set dst (ReplicateB zero));
4077   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
4078   ins_encode %{
4079     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4080     int vector_len = 1;
4081     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4082   %}
4083   ins_pipe( fpu_reg_reg );
4084 %}
4085 
4086 // Replicate char/short (2 byte) scalar to be vector
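     // movdl places the 16-bit value in the low dword of the xmm register;
     // pshuflw with immediate 0x00 then broadcasts the low word across the low
     // four word lanes.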
4087 instruct Repl2S(vecS dst, rRegI src) %{
4088   predicate(n->as_Vector()->length() == 2);
4089   match(Set dst (ReplicateS src));
4090   format %{ "movd    $dst,$src\n\t"
4091             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
4092   ins_encode %{
4093     __ movdl($dst$$XMMRegister, $src$$Register);
4094     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4095   %}
4096   ins_pipe( fpu_reg_reg );
4097 %}
4098 
4099 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
4100 instruct Repl2S_imm(vecS dst, immI con) %{
4101   predicate(n->as_Vector()->length() == 2);
4102   match(Set dst (ReplicateS con));
4103   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
4104   ins_encode %{
4105     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4106   %}
4107   ins_pipe( fpu_reg_reg );
4108 %}
4109 
4110 instruct Repl4S_imm(vecD dst, immI con) %{
4111   predicate(n->as_Vector()->length() == 4);
4112   match(Set dst (ReplicateS con));
4113   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4114   ins_encode %{
4115     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4116   %}
4117   ins_pipe( fpu_reg_reg );
4118 %}
4119 
4120 // Replicate char/short (2 byte) scalar zero to be vector
4121 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4122   predicate(n->as_Vector()->length() == 2);
4123   match(Set dst (ReplicateS zero));
4124   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4125   ins_encode %{
4126     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4127   %}
4128   ins_pipe( fpu_reg_reg );
4129 %}
4130 
4131 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4132   predicate(n->as_Vector()->length() == 4);
4133   match(Set dst (ReplicateS zero));
4134   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4135   ins_encode %{
4136     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4137   %}
4138   ins_pipe( fpu_reg_reg );
4139 %}
4140 
4141 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4142   predicate(n->as_Vector()->length() == 8);
4143   match(Set dst (ReplicateS zero));
4144   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4145   ins_encode %{
4146     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4147   %}
4148   ins_pipe( fpu_reg_reg );
4149 %}
4150 
4151 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4152   predicate(n->as_Vector()->length() == 16);
4153   match(Set dst (ReplicateS zero));
4154   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4155   ins_encode %{
4156     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4157     int vector_len = 1;
4158     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4159   %}
4160   ins_pipe( fpu_reg_reg );
4161 %}
4162 
4163 // Replicate integer (4 byte) scalar to be vector
4164 instruct Repl2I(vecD dst, rRegI src) %{
4165   predicate(n->as_Vector()->length() == 2);
4166   match(Set dst (ReplicateI src));
4167   format %{ "movd    $dst,$src\n\t"
4168             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4169   ins_encode %{
4170     __ movdl($dst$$XMMRegister, $src$$Register);
4171     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4172   %}
4173   ins_pipe( fpu_reg_reg );
4174 %}
4175 
4176 // An integer (4 byte) value can be loaded into an xmm register directly from memory.
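     // Loading with movdl straight from memory avoids a GPR round trip; the
     // following pshufd with immediate 0x00 broadcasts the loaded dword to all
     // four lanes.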
4177 instruct Repl2I_mem(vecD dst, memory mem) %{
4178   predicate(n->as_Vector()->length() == 2);
4179   match(Set dst (ReplicateI (LoadI mem)));
4180   format %{ "movd    $dst,$mem\n\t"
4181             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4182   ins_encode %{
4183     __ movdl($dst$$XMMRegister, $mem$$Address);
4184     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4185   %}
4186   ins_pipe( fpu_reg_reg );
4187 %}
4188 
4189 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4190 instruct Repl2I_imm(vecD dst, immI con) %{
4191   predicate(n->as_Vector()->length() == 2);
4192   match(Set dst (ReplicateI con));
4193   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4194   ins_encode %{
4195     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4196   %}
4197   ins_pipe( fpu_reg_reg );
4198 %}
4199 
4200 // Replicate integer (4 byte) scalar zero to be vector
4201 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4202   predicate(n->as_Vector()->length() == 2);
4203   match(Set dst (ReplicateI zero));
4204   format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
4205   ins_encode %{
4206     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4207   %}
4208   ins_pipe( fpu_reg_reg );
4209 %}
4210 
4211 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4212   predicate(n->as_Vector()->length() == 4);
4213   match(Set dst (ReplicateI zero));
4214   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
4215   ins_encode %{
4216     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4217   %}
4218   ins_pipe( fpu_reg_reg );
4219 %}
4220 
4221 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4222   predicate(n->as_Vector()->length() == 8);
4223   match(Set dst (ReplicateI zero));
4224   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4225   ins_encode %{
4226     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4227     int vector_len = 1;
4228     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4229   %}
4230   ins_pipe( fpu_reg_reg );
4231 %}
4232 
4233 // Replicate long (8 byte) scalar to be vector
4234 #ifdef _LP64
4235 instruct Repl2L(vecX dst, rRegL src) %{
4236   predicate(n->as_Vector()->length() == 2);
4237   match(Set dst (ReplicateL src));
4238   format %{ "movdq   $dst,$src\n\t"
4239             "punpcklqdq $dst,$dst\t! replicate2L" %}
4240   ins_encode %{
4241     __ movdq($dst$$XMMRegister, $src$$Register);
4242     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4243   %}
4244   ins_pipe( pipe_slow );
4245 %}
4246 #else // _LP64
4247 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4248   predicate(n->as_Vector()->length() == 2);
4249   match(Set dst (ReplicateL src));
4250   effect(TEMP dst, USE src, TEMP tmp);
4251   format %{ "movdl   $dst,$src.lo\n\t"
4252             "movdl   $tmp,$src.hi\n\t"
4253             "punpckldq $dst,$tmp\n\t"
4254             "punpcklqdq $dst,$dst\t! replicate2L"%}
4255   ins_encode %{
4256     __ movdl($dst$$XMMRegister, $src$$Register);
4257     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4258     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4259     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4260   %}
4261   ins_pipe( pipe_slow );
4262 %}
4263 #endif // _LP64
4264 
4265 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
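     // A long constant already fills 64 bits, so it is loaded from the constant
     // table unchanged and punpcklqdq broadcasts it; no replicate*_imm packing
     // helper is needed.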
4266 instruct Repl2L_imm(vecX dst, immL con) %{
4267   predicate(n->as_Vector()->length() == 2);
4268   match(Set dst (ReplicateL con));
4269   format %{ "movq    $dst,[$constantaddress]\n\t"
4270             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4271   ins_encode %{
4272     __ movq($dst$$XMMRegister, $constantaddress($con));
4273     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4274   %}
4275   ins_pipe( pipe_slow );
4276 %}
4277 
4278 // Replicate long (8 byte) scalar zero to be vector
4279 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4280   predicate(n->as_Vector()->length() == 2);
4281   match(Set dst (ReplicateL zero));
4282   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4283   ins_encode %{
4284     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4285   %}
4286   ins_pipe( fpu_reg_reg );
4287 %}
4288 
4289 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4290   predicate(n->as_Vector()->length() == 4);
4291   match(Set dst (ReplicateL zero));
4292   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4293   ins_encode %{
4294     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4295     int vector_len = 1;
4296     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4297   %}
4298   ins_pipe( fpu_reg_reg );
4299 %}
4300 
4301 // Replicate float (4 byte) scalar to be vector
4302 instruct Repl2F(vecD dst, vlRegF src) %{
4303   predicate(n->as_Vector()->length() == 2);
4304   match(Set dst (ReplicateF src));
4305   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
4306   ins_encode %{
4307     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4308   %}
4309   ins_pipe( fpu_reg_reg );
4310 %}
4311 
4312 instruct Repl4F(vecX dst, vlRegF src) %{
4313   predicate(n->as_Vector()->length() == 4);
4314   match(Set dst (ReplicateF src));
4315   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
4316   ins_encode %{
4317     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4318   %}
4319   ins_pipe( pipe_slow );
4320 %}
4321 
4322 // Replicate double (8 bytes) scalar to be vector
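     // pshufd immediate 0x44 selects dwords {0,1,0,1}, i.e. it copies the low
     // 64-bit double into both halves of the xmm register.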
4323 instruct Repl2D(vecX dst, vlRegD src) %{
4324   predicate(n->as_Vector()->length() == 2);
4325   match(Set dst (ReplicateD src));
4326   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4327   ins_encode %{
4328     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4329   %}
4330   ins_pipe( pipe_slow );
4331 %}
4332 
4333 // ====================EVEX REPLICATE=============================================
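     //
     // These rules broadcast with a single AVX-512 instruction.  vector_len
     // selects the EVEX vector length (0 = 128-bit, 1 = 256-bit, 2 = 512-bit).
     // The evpbroadcast* forms take a general-purpose register source, while
     // vpbroadcast* takes an xmm register or a memory operand.  The sub-512-bit
     // forms require AVX512VL (plus AVX512BW for byte/short elements), as the
     // predicates check.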
4334 
4335 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4336   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4337   match(Set dst (ReplicateB (LoadB mem)));
4338   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4339   ins_encode %{
4340     int vector_len = 0;
4341     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4342   %}
4343   ins_pipe( pipe_slow );
4344 %}
4345 
4346 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4347   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4348   match(Set dst (ReplicateB (LoadB mem)));
4349   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4350   ins_encode %{
4351     int vector_len = 0;
4352     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4353   %}
4354   ins_pipe( pipe_slow );
4355 %}
4356 
4357 instruct Repl16B_evex(vecX dst, rRegI src) %{
4358   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4359   match(Set dst (ReplicateB src));
4360   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4361   ins_encode %{
4362     int vector_len = 0;
4363     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4364   %}
4365   ins_pipe( pipe_slow );
4366 %}
4367 
4368 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4369   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4370   match(Set dst (ReplicateB (LoadB mem)));
4371   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4372   ins_encode %{
4373     int vector_len = 0;
4374     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4375   %}
4376   ins_pipe( pipe_slow );
4377 %}
4378 
4379 instruct Repl32B_evex(vecY dst, rRegI src) %{
4380   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4381   match(Set dst (ReplicateB src));
4382   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4383   ins_encode %{
4384     int vector_len = 1;
4385     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4386   %}
4387   ins_pipe( pipe_slow );
4388 %}
4389 
4390 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4391   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4392   match(Set dst (ReplicateB (LoadB mem)));
4393   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4394   ins_encode %{
4395     int vector_len = 1;
4396     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4397   %}
4398   ins_pipe( pipe_slow );
4399 %}
4400 
4401 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4402   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4403   match(Set dst (ReplicateB src));
4404   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4405   ins_encode %{
4406     int vector_len = 2;
4407     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4408   %}
4409   ins_pipe( pipe_slow );
4410 %}
4411 
4412 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4413   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4414   match(Set dst (ReplicateB (LoadB mem)));
4415   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4416   ins_encode %{
4417     int vector_len = 2;
4418     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4419   %}
4420   ins_pipe( pipe_slow );
4421 %}
4422 
4423 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4424   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4425   match(Set dst (ReplicateB con));
4426   format %{ "movq    $dst,[$constantaddress]\n\t"
4427             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4428   ins_encode %{
4429     int vector_len = 0;
4430     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4431     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4432   %}
4433   ins_pipe( pipe_slow );
4434 %}
4435 
4436 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4437   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4438   match(Set dst (ReplicateB con));
4439   format %{ "movq    $dst,[$constantaddress]\n\t"
4440             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4441   ins_encode %{
4442     int vector_len = 1;
4443     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4444     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4445   %}
4446   ins_pipe( pipe_slow );
4447 %}
4448 
4449 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4450   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4451   match(Set dst (ReplicateB con));
4452   format %{ "movq    $dst,[$constantaddress]\n\t"
4453             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4454   ins_encode %{
4455     int vector_len = 2;
4456     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4457     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4458   %}
4459   ins_pipe( pipe_slow );
4460 %}
4461 
4462 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4463   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4464   match(Set dst (ReplicateB zero));
4465   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4466   ins_encode %{
4467     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4468     int vector_len = 2;
4469     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4470   %}
4471   ins_pipe( fpu_reg_reg );
4472 %}
4473 
4474 instruct Repl4S_evex(vecD dst, rRegI src) %{
4475   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4476   match(Set dst (ReplicateS src));
4477   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4478   ins_encode %{
4479     int vector_len = 0;
4480     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4481   %}
4482   ins_pipe( pipe_slow );
4483 %}
4484 
4485 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4486   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4487   match(Set dst (ReplicateS (LoadS mem)));
4488   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4489   ins_encode %{
4490     int vector_len = 0;
4491     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4492   %}
4493   ins_pipe( pipe_slow );
4494 %}
4495 
4496 instruct Repl8S_evex(vecX dst, rRegI src) %{
4497   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4498   match(Set dst (ReplicateS src));
4499   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4500   ins_encode %{
4501     int vector_len = 0;
4502     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4503   %}
4504   ins_pipe( pipe_slow );
4505 %}
4506 
4507 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4508   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4509   match(Set dst (ReplicateS (LoadS mem)));
4510   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4511   ins_encode %{
4512     int vector_len = 0;
4513     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4514   %}
4515   ins_pipe( pipe_slow );
4516 %}
4517 
4518 instruct Repl16S_evex(vecY dst, rRegI src) %{
4519   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4520   match(Set dst (ReplicateS src));
4521   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4522   ins_encode %{
4523     int vector_len = 1;
4524     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4525   %}
4526   ins_pipe( pipe_slow );
4527 %}
4528 
4529 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4530   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4531   match(Set dst (ReplicateS (LoadS mem)));
4532   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4533   ins_encode %{
4534     int vector_len = 1;
4535     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4536   %}
4537   ins_pipe( pipe_slow );
4538 %}
4539 
4540 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4541   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4542   match(Set dst (ReplicateS src));
4543   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4544   ins_encode %{
4545     int vector_len = 2;
4546     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4547   %}
4548   ins_pipe( pipe_slow );
4549 %}
4550 
4551 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4552   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4553   match(Set dst (ReplicateS (LoadS mem)));
4554   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4555   ins_encode %{
4556     int vector_len = 2;
4557     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4558   %}
4559   ins_pipe( pipe_slow );
4560 %}
4561 
4562 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4563   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4564   match(Set dst (ReplicateS con));
4565   format %{ "movq    $dst,[$constantaddress]\n\t"
4566             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4567   ins_encode %{
4568     int vector_len = 0;
4569     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4570     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4571   %}
4572   ins_pipe( pipe_slow );
4573 %}
4574 
4575 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4576   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4577   match(Set dst (ReplicateS con));
4578   format %{ "movq    $dst,[$constantaddress]\n\t"
4579             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4580   ins_encode %{
4581     int vector_len = 1;
4582     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4583     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4584   %}
4585   ins_pipe( pipe_slow );
4586 %}
4587 
4588 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4589   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4590   match(Set dst (ReplicateS con));
4591   format %{ "movq    $dst,[$constantaddress]\n\t"
4592             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4593   ins_encode %{
4594     int vector_len = 2;
4595     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4596     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4597   %}
4598   ins_pipe( pipe_slow );
4599 %}
4600 
4601 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4602   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4603   match(Set dst (ReplicateS zero));
4604   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4605   ins_encode %{
4606     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4607     int vector_len = 2;
4608     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4609   %}
4610   ins_pipe( fpu_reg_reg );
4611 %}
4612 
4613 instruct Repl4I_evex(vecX dst, rRegI src) %{
4614   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4615   match(Set dst (ReplicateI src));
4616   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4617   ins_encode %{
4618     int vector_len = 0;
4619     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4620   %}
4621   ins_pipe( pipe_slow );
4622 %}
4623 
4624 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4625   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4626   match(Set dst (ReplicateI (LoadI mem)));
4627   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4628   ins_encode %{
4629     int vector_len = 0;
4630     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4631   %}
4632   ins_pipe( pipe_slow );
4633 %}
4634 
4635 instruct Repl8I_evex(vecY dst, rRegI src) %{
4636   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4637   match(Set dst (ReplicateI src));
4638   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4639   ins_encode %{
4640     int vector_len = 1;
4641     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4642   %}
4643   ins_pipe( pipe_slow );
4644 %}
4645 
4646 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4647   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4648   match(Set dst (ReplicateI (LoadI mem)));
4649   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4650   ins_encode %{
4651     int vector_len = 1;
4652     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4653   %}
4654   ins_pipe( pipe_slow );
4655 %}
4656 
4657 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4658   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4659   match(Set dst (ReplicateI src));
4660   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4661   ins_encode %{
4662     int vector_len = 2;
4663     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4664   %}
4665   ins_pipe( pipe_slow );
4666 %}
4667 
4668 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4669   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4670   match(Set dst (ReplicateI (LoadI mem)));
4671   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4672   ins_encode %{
4673     int vector_len = 2;
4674     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4675   %}
4676   ins_pipe( pipe_slow );
4677 %}
4678 
4679 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4680   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4681   match(Set dst (ReplicateI con));
4682   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4683             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4684   ins_encode %{
4685     int vector_len = 0;
4686     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4687     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4688   %}
4689   ins_pipe( pipe_slow );
4690 %}
4691 
4692 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4693   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4694   match(Set dst (ReplicateI con));
4695   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4696             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4697   ins_encode %{
4698     int vector_len = 1;
4699     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4700     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4701   %}
4702   ins_pipe( pipe_slow );
4703 %}
4704 
4705 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4706   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4707   match(Set dst (ReplicateI con));
4708   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4709             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4710   ins_encode %{
4711     int vector_len = 2;
4712     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4713     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4714   %}
4715   ins_pipe( pipe_slow );
4716 %}
4717 
4718 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4719   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4720   match(Set dst (ReplicateI zero));
4721   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4722   ins_encode %{
4723     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4724     int vector_len = 2;
4725     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4726   %}
4727   ins_pipe( fpu_reg_reg );
4728 %}
4729 
4730 // Replicate long (8 byte) scalar to be vector
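     // As in the generic section, 32-bit VMs assemble the long from its register
     // pair with movdl/punpckldq and then broadcast from the xmm register rather
     // than using the GPR form of evpbroadcastq.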
4731 #ifdef _LP64
4732 instruct Repl4L_evex(vecY dst, rRegL src) %{
4733   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4734   match(Set dst (ReplicateL src));
4735   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4736   ins_encode %{
4737     int vector_len = 1;
4738     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4739   %}
4740   ins_pipe( pipe_slow );
4741 %}
4742 
4743 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4744   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4745   match(Set dst (ReplicateL src));
4746   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4747   ins_encode %{
4748     int vector_len = 2;
4749     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4750   %}
4751   ins_pipe( pipe_slow );
4752 %}
4753 #else // _LP64
4754 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4755   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4756   match(Set dst (ReplicateL src));
4757   effect(TEMP dst, USE src, TEMP tmp);
4758   format %{ "movdl   $dst,$src.lo\n\t"
4759             "movdl   $tmp,$src.hi\n\t"
4760             "punpckldq $dst,$tmp\n\t"
4761             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4762   ins_encode %{
4763     int vector_len = 1;
4764     __ movdl($dst$$XMMRegister, $src$$Register);
4765     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4766     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4767     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4768   %}
4769   ins_pipe( pipe_slow );
4770 %}
4771 
4772 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4773   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4774   match(Set dst (ReplicateL src));
4775   effect(TEMP dst, USE src, TEMP tmp);
4776   format %{ "movdl   $dst,$src.lo\n\t"
4777             "movdl   $tmp,$src.hi\n\t"
4778             "punpckldq $dst,$tmp\n\t"
4779             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4780   ins_encode %{
4781     int vector_len = 2;
4782     __ movdl($dst$$XMMRegister, $src$$Register);
4783     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4784     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4785     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4786   %}
4787   ins_pipe( pipe_slow );
4788 %}
4789 #endif // _LP64
4790 
4791 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4792   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4793   match(Set dst (ReplicateL con));
4794   format %{ "movq    $dst,[$constantaddress]\n\t"
4795             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4796   ins_encode %{
4797     int vector_len = 1;
4798     __ movq($dst$$XMMRegister, $constantaddress($con));
4799     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4800   %}
4801   ins_pipe( pipe_slow );
4802 %}
4803 
4804 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4805   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4806   match(Set dst (ReplicateL con));
4807   format %{ "movq    $dst,[$constantaddress]\n\t"
4808             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4809   ins_encode %{
4810     int vector_len = 2;
4811     __ movq($dst$$XMMRegister, $constantaddress($con));
4812     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4813   %}
4814   ins_pipe( pipe_slow );
4815 %}
4816 
4817 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4818   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4819   match(Set dst (ReplicateL (LoadL mem)));
4820   format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
4821   ins_encode %{
4822     int vector_len = 0;
4823     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4824   %}
4825   ins_pipe( pipe_slow );
4826 %}
4827 
4828 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4829   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4830   match(Set dst (ReplicateL (LoadL mem)));
4831   format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
4832   ins_encode %{
4833     int vector_len = 1;
4834     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4835   %}
4836   ins_pipe( pipe_slow );
4837 %}
4838 
4839 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4840   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4841   match(Set dst (ReplicateL (LoadL mem)));
4842   format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
4843   ins_encode %{
4844     int vector_len = 2;
4845     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4846   %}
4847   ins_pipe( pipe_slow );
4848 %}
4849 
4850 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4851   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4852   match(Set dst (ReplicateL zero));
4853   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4854   ins_encode %{
4855     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4856     int vector_len = 2;
4857     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4858   %}
4859   ins_pipe( fpu_reg_reg );
4860 %}
4861 
4862 instruct Repl8F_evex(vecY dst, regF src) %{
4863   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4864   match(Set dst (ReplicateF src));
4865   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4866   ins_encode %{
4867     int vector_len = 1;
4868     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4869   %}
4870   ins_pipe( pipe_slow );
4871 %}
4872 
4873 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4874   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4875   match(Set dst (ReplicateF (LoadF mem)));
4876   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4877   ins_encode %{
4878     int vector_len = 1;
4879     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4880   %}
4881   ins_pipe( pipe_slow );
4882 %}
4883 
4884 instruct Repl16F_evex(vecZ dst, regF src) %{
4885   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4886   match(Set dst (ReplicateF src));
4887   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4888   ins_encode %{
4889     int vector_len = 2;
4890     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4891   %}
4892   ins_pipe( pipe_slow );
4893 %}
4894 
4895 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4896   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4897   match(Set dst (ReplicateF (LoadF mem)));
4898   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4899   ins_encode %{
4900     int vector_len = 2;
4901     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4902   %}
4903   ins_pipe( pipe_slow );
4904 %}
4905 
4906 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4907   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4908   match(Set dst (ReplicateF zero));
4909   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4910   ins_encode %{
4911     // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ; this is a 512-bit operation
4912     int vector_len = 2;
4913     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4914   %}
4915   ins_pipe( fpu_reg_reg );
4916 %}
4917 
4918 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4919   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4920   match(Set dst (ReplicateF zero));
4921   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4922   ins_encode %{
4923     // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ; this is a 512-bit operation
4924     int vector_len = 2;
4925     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4926   %}
4927   ins_pipe( fpu_reg_reg );
4928 %}
4929 
4930 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4931   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4932   match(Set dst (ReplicateF zero));
4933   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4934   ins_encode %{
4935     // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ; this is a 512-bit operation
4936     int vector_len = 2;
4937     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4938   %}
4939   ins_pipe( fpu_reg_reg );
4940 %}
4941 
4942 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4943   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4944   match(Set dst (ReplicateF zero));
4945   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4946   ins_encode %{
4947     // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ; this is a 512-bit operation
4948     int vector_len = 2;
4949     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4950   %}
4951   ins_pipe( fpu_reg_reg );
4952 %}
4953 
4954 instruct Repl4D_evex(vecY dst, regD src) %{
4955   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4956   match(Set dst (ReplicateD src));
4957   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4958   ins_encode %{
4959     int vector_len = 1;
4960     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4961   %}
4962   ins_pipe( pipe_slow );
4963 %}
4964 
4965 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4966   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4967   match(Set dst (ReplicateD (LoadD mem)));
4968   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4969   ins_encode %{
4970     int vector_len = 1;
4971     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4972   %}
4973   ins_pipe( pipe_slow );
4974 %}
4975 
4976 instruct Repl8D_evex(vecZ dst, regD src) %{
4977   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4978   match(Set dst (ReplicateD src));
4979   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4980   ins_encode %{
4981     int vector_len = 2;
4982     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4983   %}
4984   ins_pipe( pipe_slow );
4985 %}
4986 
4987 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4988   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4989   match(Set dst (ReplicateD (LoadD mem)));
4990   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4991   ins_encode %{
4992     int vector_len = 2;
4993     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4994   %}
4995   ins_pipe( pipe_slow );
4996 %}
4997 
4998 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4999   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
5000   match(Set dst (ReplicateD zero));
5001   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
5002   ins_encode %{
5003     // Use vpxor in place of vxorpd since the EVEX encoding of vxorpd requires AVX512DQ; this is a 512-bit operation
5004     int vector_len = 2;
5005     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5006   %}
5007   ins_pipe( fpu_reg_reg );
5008 %}
5009 
5010 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
5011   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
5012   match(Set dst (ReplicateD zero));
5013   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
5014   ins_encode %{
5015     // Use vpxor in place of vxorpd since the EVEX encoding of vxorpd requires AVX512DQ; this is a 512-bit operation
5016     int vector_len = 2;
5017     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5018   %}
5019   ins_pipe( fpu_reg_reg );
5020 %}
5021 
5022 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
5023   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5024   match(Set dst (ReplicateD zero));
5025   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
5026   ins_encode %{
5027     // Use vpxor in place of vxorpd since the EVEX encoding of vxorpd requires AVX512DQ; this is a 512-bit operation
5028     int vector_len = 2;
5029     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5030   %}
5031   ins_pipe( fpu_reg_reg );
5032 %}
5033 
5034 // ====================REDUCTION ARITHMETIC=======================================
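     //
     // AddReductionVI computes a scalar: in effect, int r = $src1; for each
     // element e of $src2, r += e; $dst = r.  The SSE and AVX1/2 rules fold the
     // vector with (v)phaddd, which adds adjacent dword pairs; the AVX-512 rules
     // instead shuffle the upper half down (pshufd 0xE, then 0x1) and add with
     // vpaddd, then fold in the scalar $src1 and extract the result with movd.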
5035 
5036 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5037   predicate(UseSSE > 2 && UseAVX == 0);
5038   match(Set dst (AddReductionVI src1 src2));
5039   effect(TEMP tmp2, TEMP tmp);
5040   format %{ "movdqu  $tmp2,$src2\n\t"
5041             "phaddd  $tmp2,$tmp2\n\t"
5042             "movd    $tmp,$src1\n\t"
5043             "paddd   $tmp,$tmp2\n\t"
5044             "movd    $dst,$tmp\t! add reduction2I" %}
5045   ins_encode %{
5046     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
5047     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5048     __ movdl($tmp$$XMMRegister, $src1$$Register);
5049     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
5050     __ movdl($dst$$Register, $tmp$$XMMRegister);
5051   %}
5052   ins_pipe( pipe_slow );
5053 %}
5054 
5055 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5056   predicate(VM_Version::supports_avxonly());
5057   match(Set dst (AddReductionVI src1 src2));
5058   effect(TEMP tmp, TEMP tmp2);
5059   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5060             "movd     $tmp2,$src1\n\t"
5061             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5062             "movd     $dst,$tmp2\t! add reduction2I" %}
5063   ins_encode %{
5064     int vector_len = 0;
5065     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5066     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5067     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5068     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5069   %}
5070   ins_pipe( pipe_slow );
5071 %}
5072 
5073 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5074   predicate(UseAVX > 2);
5075   match(Set dst (AddReductionVI src1 src2));
5076   effect(TEMP tmp, TEMP tmp2);
5077   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5078             "vpaddd  $tmp,$src2,$tmp2\n\t"
5079             "movd    $tmp2,$src1\n\t"
5080             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5081             "movd    $dst,$tmp2\t! add reduction2I" %}
5082   ins_encode %{
5083     int vector_len = 0;
5084     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5085     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5086     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5087     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5088     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5089   %}
5090   ins_pipe( pipe_slow );
5091 %}
5092 
5093 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5094   predicate(UseSSE > 2 && UseAVX == 0);
5095   match(Set dst (AddReductionVI src1 src2));
5096   effect(TEMP tmp, TEMP tmp2);
5097   format %{ "movdqu  $tmp,$src2\n\t"
5098             "phaddd  $tmp,$tmp\n\t"
5099             "phaddd  $tmp,$tmp\n\t"
5100             "movd    $tmp2,$src1\n\t"
5101             "paddd   $tmp2,$tmp\n\t"
5102             "movd    $dst,$tmp2\t! add reduction4I" %}
5103   ins_encode %{
5104     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
5105     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5106     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5107     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5108     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
5109     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5110   %}
5111   ins_pipe( pipe_slow );
5112 %}
5113 
5114 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5115   predicate(VM_Version::supports_avxonly());
5116   match(Set dst (AddReductionVI src1 src2));
5117   effect(TEMP tmp, TEMP tmp2);
5118   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5119             "vphaddd  $tmp,$tmp,$tmp\n\t"
5120             "movd     $tmp2,$src1\n\t"
5121             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5122             "movd     $dst,$tmp2\t! add reduction4I" %}
5123   ins_encode %{
5124     int vector_len = 0;
5125     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5126     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
5127     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5128     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5129     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5130   %}
5131   ins_pipe( pipe_slow );
5132 %}
5133 
5134 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5135   predicate(UseAVX > 2);
5136   match(Set dst (AddReductionVI src1 src2));
5137   effect(TEMP tmp, TEMP tmp2);
5138   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5139             "vpaddd  $tmp,$src2,$tmp2\n\t"
5140             "pshufd  $tmp2,$tmp,0x1\n\t"
5141             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5142             "movd    $tmp2,$src1\n\t"
5143             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5144             "movd    $dst,$tmp2\t! add reduction4I" %}
5145   ins_encode %{
5146     int vector_len = 0;
5147     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5148     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5149     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5150     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5151     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5152     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5153     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5154   %}
5155   ins_pipe( pipe_slow );
5156 %}
5157 
5158 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5159   predicate(VM_Version::supports_avxonly());
5160   match(Set dst (AddReductionVI src1 src2));
5161   effect(TEMP tmp, TEMP tmp2);
5162   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5163             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5164             "vextracti128_high  $tmp2,$tmp\n\t"
5165             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5166             "movd     $tmp2,$src1\n\t"
5167             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5168             "movd     $dst,$tmp2\t! add reduction8I" %}
5169   ins_encode %{
5170     int vector_len = 1;
5171     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5172     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5173     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5174     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5175     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5176     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5177     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5178   %}
5179   ins_pipe( pipe_slow );
5180 %}
5181 
5182 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5183   predicate(UseAVX > 2);
5184   match(Set dst (AddReductionVI src1 src2));
5185   effect(TEMP tmp, TEMP tmp2);
5186   format %{ "vextracti128_high  $tmp,$src2\n\t"
5187             "vpaddd  $tmp,$tmp,$src2\n\t"
5188             "pshufd  $tmp2,$tmp,0xE\n\t"
5189             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5190             "pshufd  $tmp2,$tmp,0x1\n\t"
5191             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5192             "movd    $tmp2,$src1\n\t"
5193             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5194             "movd    $dst,$tmp2\t! add reduction8I" %}
5195   ins_encode %{
5196     int vector_len = 0;
5197     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5198     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5199     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5200     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5201     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5202     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5203     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5204     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5205     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5206   %}
5207   ins_pipe( pipe_slow );
5208 %}
5209 
5210 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5211   predicate(UseAVX > 2);
5212   match(Set dst (AddReductionVI src1 src2));
5213   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5214   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5215             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5216             "vextracti128_high  $tmp,$tmp3\n\t"
5217             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5218             "pshufd  $tmp2,$tmp,0xE\n\t"
5219             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5220             "pshufd  $tmp2,$tmp,0x1\n\t"
5221             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5222             "movd    $tmp2,$src1\n\t"
5223             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5224             "movd    $dst,$tmp2\t! mul reduction16I" %}
5225   ins_encode %{
5226     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5227     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5228     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5229     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5230     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5231     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5232     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5233     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5234     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5235     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5236     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5237   %}
5238   ins_pipe( pipe_slow );
5239 %}
5240 
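// The long (VL) reductions move the scalar operand between a general-purpose
// register and an XMM register with movdq, which requires a 64-bit GPR, so
// these patterns exist only under _LP64.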
5241 #ifdef _LP64
5242 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5243   predicate(UseAVX > 2);
5244   match(Set dst (AddReductionVL src1 src2));
5245   effect(TEMP tmp, TEMP tmp2);
5246   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5247             "vpaddq  $tmp,$src2,$tmp2\n\t"
5248             "movdq   $tmp2,$src1\n\t"
5249             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5250             "movdq   $dst,$tmp2\t! add reduction2L" %}
5251   ins_encode %{
5252     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5253     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5254     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5255     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5256     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5257   %}
5258   ins_pipe( pipe_slow );
5259 %}
5260 
5261 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5262   predicate(UseAVX > 2);
5263   match(Set dst (AddReductionVL src1 src2));
5264   effect(TEMP tmp, TEMP tmp2);
5265   format %{ "vextracti128_high  $tmp,$src2\n\t"
5266             "vpaddq  $tmp2,$tmp,$src2\n\t"
5267             "pshufd  $tmp,$tmp2,0xE\n\t"
5268             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5269             "movdq   $tmp,$src1\n\t"
5270             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5271             "movdq   $dst,$tmp2\t! add reduction4L" %}
5272   ins_encode %{
5273     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5274     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5275     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5276     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5277     __ movdq($tmp$$XMMRegister, $src1$$Register);
5278     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5279     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5280   %}
5281   ins_pipe( pipe_slow );
5282 %}
5283 
5284 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5285   predicate(UseAVX > 2);
5286   match(Set dst (AddReductionVL src1 src2));
5287   effect(TEMP tmp, TEMP tmp2);
5288   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5289             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5290             "vextracti128_high  $tmp,$tmp2\n\t"
5291             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5292             "pshufd  $tmp,$tmp2,0xE\n\t"
5293             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5294             "movdq   $tmp,$src1\n\t"
5295             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5296             "movdq   $dst,$tmp2\t! add reduction8L" %}
5297   ins_encode %{
5298     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5299     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5300     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5301     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5302     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5303     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5304     __ movdq($tmp$$XMMRegister, $src1$$Register);
5305     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5306     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5307   %}
5308   ins_pipe( pipe_slow );
5309 %}
5310 #endif
5311 
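// Floating-point add and mul reductions accumulate the lanes strictly in
// index order using scalar addss/addsd (or their AVX forms) rather than a
// horizontal or tree-shaped reduction: FP addition and multiplication are
// not associative, so the lane order affects the rounded result.
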
5312 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5313   predicate(UseSSE >= 1 && UseAVX == 0);
5314   match(Set dst (AddReductionVF dst src2));
5315   effect(TEMP dst, TEMP tmp);
5316   format %{ "addss   $dst,$src2\n\t"
5317             "pshufd  $tmp,$src2,0x01\n\t"
5318             "addss   $dst,$tmp\t! add reduction2F" %}
5319   ins_encode %{
5320     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5321     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5322     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5323   %}
5324   ins_pipe( pipe_slow );
5325 %}
5326 
5327 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5328   predicate(UseAVX > 0);
5329   match(Set dst (AddReductionVF dst src2));
5330   effect(TEMP dst, TEMP tmp);
5331   format %{ "vaddss  $dst,$dst,$src2\n\t"
5332             "pshufd  $tmp,$src2,0x01\n\t"
5333             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5334   ins_encode %{
5335     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5336     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5337     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5338   %}
5339   ins_pipe( pipe_slow );
5340 %}
5341 
5342 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5343   predicate(UseSSE >= 1 && UseAVX == 0);
5344   match(Set dst (AddReductionVF dst src2));
5345   effect(TEMP dst, TEMP tmp);
5346   format %{ "addss   $dst,$src2\n\t"
5347             "pshufd  $tmp,$src2,0x01\n\t"
5348             "addss   $dst,$tmp\n\t"
5349             "pshufd  $tmp,$src2,0x02\n\t"
5350             "addss   $dst,$tmp\n\t"
5351             "pshufd  $tmp,$src2,0x03\n\t"
5352             "addss   $dst,$tmp\t! add reduction4F" %}
5353   ins_encode %{
5354     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5355     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5356     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5357     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5358     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5359     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5360     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5361   %}
5362   ins_pipe( pipe_slow );
5363 %}
5364 
5365 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5366   predicate(UseAVX > 0);
5367   match(Set dst (AddReductionVF dst src2));
5368   effect(TEMP tmp, TEMP dst);
5369   format %{ "vaddss  $dst,dst,$src2\n\t"
5370             "pshufd  $tmp,$src2,0x01\n\t"
5371             "vaddss  $dst,$dst,$tmp\n\t"
5372             "pshufd  $tmp,$src2,0x02\n\t"
5373             "vaddss  $dst,$dst,$tmp\n\t"
5374             "pshufd  $tmp,$src2,0x03\n\t"
5375             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5376   ins_encode %{
5377     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5378     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5379     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5380     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5381     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5382     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5383     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5384   %}
5385   ins_pipe( pipe_slow );
5386 %}
5387 
5388 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5389   predicate(UseAVX > 0);
5390   match(Set dst (AddReductionVF dst src2));
5391   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5392   format %{ "vaddss  $dst,$dst,$src2\n\t"
5393             "pshufd  $tmp,$src2,0x01\n\t"
5394             "vaddss  $dst,$dst,$tmp\n\t"
5395             "pshufd  $tmp,$src2,0x02\n\t"
5396             "vaddss  $dst,$dst,$tmp\n\t"
5397             "pshufd  $tmp,$src2,0x03\n\t"
5398             "vaddss  $dst,$dst,$tmp\n\t"
5399             "vextractf128_high  $tmp2,$src2\n\t"
5400             "vaddss  $dst,$dst,$tmp2\n\t"
5401             "pshufd  $tmp,$tmp2,0x01\n\t"
5402             "vaddss  $dst,$dst,$tmp\n\t"
5403             "pshufd  $tmp,$tmp2,0x02\n\t"
5404             "vaddss  $dst,$dst,$tmp\n\t"
5405             "pshufd  $tmp,$tmp2,0x03\n\t"
5406             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5407   ins_encode %{
5408     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5409     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5410     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5411     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5412     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5413     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5414     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5415     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5416     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5417     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5418     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5419     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5420     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5421     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5422     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5423   %}
5424   ins_pipe( pipe_slow );
5425 %}
5426 
5427 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5428   predicate(UseAVX > 2);
5429   match(Set dst (AddReductionVF dst src2));
5430   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5431   format %{ "vaddss  $dst,$dst,$src2\n\t"
5432             "pshufd  $tmp,$src2,0x01\n\t"
5433             "vaddss  $dst,$dst,$tmp\n\t"
5434             "pshufd  $tmp,$src2,0x02\n\t"
5435             "vaddss  $dst,$dst,$tmp\n\t"
5436             "pshufd  $tmp,$src2,0x03\n\t"
5437             "vaddss  $dst,$dst,$tmp\n\t"
5438             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5439             "vaddss  $dst,$dst,$tmp2\n\t"
5440             "pshufd  $tmp,$tmp2,0x01\n\t"
5441             "vaddss  $dst,$dst,$tmp\n\t"
5442             "pshufd  $tmp,$tmp2,0x02\n\t"
5443             "vaddss  $dst,$dst,$tmp\n\t"
5444             "pshufd  $tmp,$tmp2,0x03\n\t"
5445             "vaddss  $dst,$dst,$tmp\n\t"
5446             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5447             "vaddss  $dst,$dst,$tmp2\n\t"
5448             "pshufd  $tmp,$tmp2,0x01\n\t"
5449             "vaddss  $dst,$dst,$tmp\n\t"
5450             "pshufd  $tmp,$tmp2,0x02\n\t"
5451             "vaddss  $dst,$dst,$tmp\n\t"
5452             "pshufd  $tmp,$tmp2,0x03\n\t"
5453             "vaddss  $dst,$dst,$tmp\n\t"
5454             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5455             "vaddss  $dst,$dst,$tmp2\n\t"
5456             "pshufd  $tmp,$tmp2,0x01\n\t"
5457             "vaddss  $dst,$dst,$tmp\n\t"
5458             "pshufd  $tmp,$tmp2,0x02\n\t"
5459             "vaddss  $dst,$dst,$tmp\n\t"
5460             "pshufd  $tmp,$tmp2,0x03\n\t"
5461             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5462   ins_encode %{
5463     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5464     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5465     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5466     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5467     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5468     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5469     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5470     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5471     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5472     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5473     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5474     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5475     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5476     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5477     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5478     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5479     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5480     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5481     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5482     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5483     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5484     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5485     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5486     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5487     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5488     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5489     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5490     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5491     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5492     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5493     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5494   %}
5495   ins_pipe( pipe_slow );
5496 %}
5497 
5498 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5499   predicate(UseSSE >= 1 && UseAVX == 0);
5500   match(Set dst (AddReductionVD dst src2));
5501   effect(TEMP tmp, TEMP dst);
5502   format %{ "addsd   $dst,$src2\n\t"
5503             "pshufd  $tmp,$src2,0xE\n\t"
5504             "addsd   $dst,$tmp\t! add reduction2D" %}
5505   ins_encode %{
5506     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5507     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5508     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5509   %}
5510   ins_pipe( pipe_slow );
5511 %}
5512 
5513 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5514   predicate(UseAVX > 0);
5515   match(Set dst (AddReductionVD dst src2));
5516   effect(TEMP tmp, TEMP dst);
5517   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5518             "pshufd  $tmp,$src2,0xE\n\t"
5519             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5520   ins_encode %{
5521     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5522     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5523     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5524   %}
5525   ins_pipe( pipe_slow );
5526 %}
5527 
5528 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5529   predicate(UseAVX > 0);
5530   match(Set dst (AddReductionVD dst src2));
5531   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5532   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5533             "pshufd  $tmp,$src2,0xE\n\t"
5534             "vaddsd  $dst,$dst,$tmp\n\t"
5535             "vextractf128  $tmp2,$src2,0x1\n\t"
5536             "vaddsd  $dst,$dst,$tmp2\n\t"
5537             "pshufd  $tmp,$tmp2,0xE\n\t"
5538             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5539   ins_encode %{
5540     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5541     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5542     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5543     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5544     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5545     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5546     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5547   %}
5548   ins_pipe( pipe_slow );
5549 %}
5550 
5551 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5552   predicate(UseAVX > 2);
5553   match(Set dst (AddReductionVD dst src2));
5554   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5555   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5556             "pshufd  $tmp,$src2,0xE\n\t"
5557             "vaddsd  $dst,$dst,$tmp\n\t"
5558             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5559             "vaddsd  $dst,$dst,$tmp2\n\t"
5560             "pshufd  $tmp,$tmp2,0xE\n\t"
5561             "vaddsd  $dst,$dst,$tmp\n\t"
5562             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5563             "vaddsd  $dst,$dst,$tmp2\n\t"
5564             "pshufd  $tmp,$tmp2,0xE\n\t"
5565             "vaddsd  $dst,$dst,$tmp\n\t"
5566             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5567             "vaddsd  $dst,$dst,$tmp2\n\t"
5568             "pshufd  $tmp,$tmp2,0xE\n\t"
5569             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5570   ins_encode %{
5571     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5572     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5573     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5574     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5575     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5576     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5577     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5578     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5579     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5580     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5581     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5582     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5583     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5584     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5585     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5586   %}
5587   ins_pipe( pipe_slow );
5588 %}
5589 
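// Integer multiply reductions use the same shuffle-and-combine scheme as the
// add reductions above, but depend on packed multiply support: pmulld/vpmulld
// need SSE4.1 (UseSSE > 3) or AVX, and the vpmullq-based long forms below
// need AVX512DQ.
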
5590 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5591   predicate(UseSSE > 3 && UseAVX == 0);
5592   match(Set dst (MulReductionVI src1 src2));
5593   effect(TEMP tmp, TEMP tmp2);
5594   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5595             "pmulld  $tmp2,$src2\n\t"
5596             "movd    $tmp,$src1\n\t"
5597             "pmulld  $tmp2,$tmp\n\t"
5598             "movd    $dst,$tmp2\t! mul reduction2I" %}
5599   ins_encode %{
5600     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5601     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5602     __ movdl($tmp$$XMMRegister, $src1$$Register);
5603     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5604     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5605   %}
5606   ins_pipe( pipe_slow );
5607 %}
5608 
5609 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5610   predicate(UseAVX > 0);
5611   match(Set dst (MulReductionVI src1 src2));
5612   effect(TEMP tmp, TEMP tmp2);
5613   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5614             "vpmulld  $tmp,$src2,$tmp2\n\t"
5615             "movd     $tmp2,$src1\n\t"
5616             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5617             "movd     $dst,$tmp2\t! mul reduction2I" %}
5618   ins_encode %{
5619     int vector_len = 0;
5620     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5621     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5622     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5623     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5624     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5625   %}
5626   ins_pipe( pipe_slow );
5627 %}
5628 
5629 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5630   predicate(UseSSE > 3 && UseAVX == 0);
5631   match(Set dst (MulReductionVI src1 src2));
5632   effect(TEMP tmp, TEMP tmp2);
5633   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5634             "pmulld  $tmp2,$src2\n\t"
5635             "pshufd  $tmp,$tmp2,0x1\n\t"
5636             "pmulld  $tmp2,$tmp\n\t"
5637             "movd    $tmp,$src1\n\t"
5638             "pmulld  $tmp2,$tmp\n\t"
5639             "movd    $dst,$tmp2\t! mul reduction4I" %}
5640   ins_encode %{
5641     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5642     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5643     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5644     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5645     __ movdl($tmp$$XMMRegister, $src1$$Register);
5646     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5647     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5653   predicate(UseAVX > 0);
5654   match(Set dst (MulReductionVI src1 src2));
5655   effect(TEMP tmp, TEMP tmp2);
5656   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5657             "vpmulld  $tmp,$src2,$tmp2\n\t"
5658             "pshufd   $tmp2,$tmp,0x1\n\t"
5659             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5660             "movd     $tmp2,$src1\n\t"
5661             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5662             "movd     $dst,$tmp2\t! mul reduction4I" %}
5663   ins_encode %{
5664     int vector_len = 0;
5665     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5666     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5667     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5668     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5669     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5670     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5671     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5672   %}
5673   ins_pipe( pipe_slow );
5674 %}
5675 
5676 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5677   predicate(UseAVX > 1);
5678   match(Set dst (MulReductionVI src1 src2));
5679   effect(TEMP tmp, TEMP tmp2);
5680   format %{ "vextracti128_high  $tmp,$src2\n\t"
5681             "vpmulld  $tmp,$tmp,$src2\n\t"
5682             "pshufd   $tmp2,$tmp,0xE\n\t"
5683             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5684             "pshufd   $tmp2,$tmp,0x1\n\t"
5685             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5686             "movd     $tmp2,$src1\n\t"
5687             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5688             "movd     $dst,$tmp2\t! mul reduction8I" %}
5689   ins_encode %{
5690     int vector_len = 0;
5691     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5692     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5693     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5694     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5695     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5696     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5697     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5698     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5699     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5700   %}
5701   ins_pipe( pipe_slow );
5702 %}
5703 
5704 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5705   predicate(UseAVX > 2);
5706   match(Set dst (MulReductionVI src1 src2));
5707   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5708   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5709             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5710             "vextracti128_high  $tmp,$tmp3\n\t"
5711             "vpmulld  $tmp,$tmp,$src2\n\t"
5712             "pshufd   $tmp2,$tmp,0xE\n\t"
5713             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5714             "pshufd   $tmp2,$tmp,0x1\n\t"
5715             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5716             "movd     $tmp2,$src1\n\t"
5717             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5718             "movd     $dst,$tmp2\t! mul reduction16I" %}
5719   ins_encode %{
5720     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5721     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5722     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5723     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5724     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5725     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5726     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5727     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5728     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5729     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5730     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5731   %}
5732   ins_pipe( pipe_slow );
5733 %}
5734 
5735 #ifdef _LP64
5736 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5737   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5738   match(Set dst (MulReductionVL src1 src2));
5739   effect(TEMP tmp, TEMP tmp2);
5740   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5741             "vpmullq  $tmp,$src2,$tmp2\n\t"
5742             "movdq    $tmp2,$src1\n\t"
5743             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5744             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5745   ins_encode %{
5746     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5747     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5748     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5749     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5750     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5751   %}
5752   ins_pipe( pipe_slow );
5753 %}
5754 
5755 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5756   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5757   match(Set dst (MulReductionVL src1 src2));
5758   effect(TEMP tmp, TEMP tmp2);
5759   format %{ "vextracti128_high  $tmp,$src2\n\t"
5760             "vpmullq  $tmp2,$tmp,$src2\n\t"
5761             "pshufd   $tmp,$tmp2,0xE\n\t"
5762             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5763             "movdq    $tmp,$src1\n\t"
5764             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5765             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5766   ins_encode %{
5767     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5768     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5769     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5770     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5771     __ movdq($tmp$$XMMRegister, $src1$$Register);
5772     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5773     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5774   %}
5775   ins_pipe( pipe_slow );
5776 %}
5777 
5778 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5779   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5780   match(Set dst (MulReductionVL src1 src2));
5781   effect(TEMP tmp, TEMP tmp2);
5782   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5783             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5784             "vextracti128_high  $tmp,$tmp2\n\t"
5785             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5786             "pshufd   $tmp,$tmp2,0xE\n\t"
5787             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5788             "movdq    $tmp,$src1\n\t"
5789             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5790             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5791   ins_encode %{
5792     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5793     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5794     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5795     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5796     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5797     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5798     __ movdq($tmp$$XMMRegister, $src1$$Register);
5799     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5800     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5801   %}
5802   ins_pipe( pipe_slow );
5803 %}
5804 #endif
5805 
5806 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5807   predicate(UseSSE >= 1 && UseAVX == 0);
5808   match(Set dst (MulReductionVF dst src2));
5809   effect(TEMP dst, TEMP tmp);
5810   format %{ "mulss   $dst,$src2\n\t"
5811             "pshufd  $tmp,$src2,0x01\n\t"
5812             "mulss   $dst,$tmp\t! mul reduction2F" %}
5813   ins_encode %{
5814     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5815     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5816     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5817   %}
5818   ins_pipe( pipe_slow );
5819 %}
5820 
5821 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5822   predicate(UseAVX > 0);
5823   match(Set dst (MulReductionVF dst src2));
5824   effect(TEMP tmp, TEMP dst);
5825   format %{ "vmulss  $dst,$dst,$src2\n\t"
5826             "pshufd  $tmp,$src2,0x01\n\t"
5827             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5828   ins_encode %{
5829     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5830     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5831     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5837   predicate(UseSSE >= 1 && UseAVX == 0);
5838   match(Set dst (MulReductionVF dst src2));
5839   effect(TEMP dst, TEMP tmp);
5840   format %{ "mulss   $dst,$src2\n\t"
5841             "pshufd  $tmp,$src2,0x01\n\t"
5842             "mulss   $dst,$tmp\n\t"
5843             "pshufd  $tmp,$src2,0x02\n\t"
5844             "mulss   $dst,$tmp\n\t"
5845             "pshufd  $tmp,$src2,0x03\n\t"
5846             "mulss   $dst,$tmp\t! mul reduction4F" %}
5847   ins_encode %{
5848     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5849     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5850     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5851     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5852     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5853     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5854     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5855   %}
5856   ins_pipe( pipe_slow );
5857 %}
5858 
5859 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5860   predicate(UseAVX > 0);
5861   match(Set dst (MulReductionVF dst src2));
5862   effect(TEMP tmp, TEMP dst);
5863   format %{ "vmulss  $dst,$dst,$src2\n\t"
5864             "pshufd  $tmp,$src2,0x01\n\t"
5865             "vmulss  $dst,$dst,$tmp\n\t"
5866             "pshufd  $tmp,$src2,0x02\n\t"
5867             "vmulss  $dst,$dst,$tmp\n\t"
5868             "pshufd  $tmp,$src2,0x03\n\t"
5869             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5870   ins_encode %{
5871     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5872     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5873     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5874     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5875     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5876     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5877     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5878   %}
5879   ins_pipe( pipe_slow );
5880 %}
5881 
5882 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5883   predicate(UseAVX > 0);
5884   match(Set dst (MulReductionVF dst src2));
5885   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5886   format %{ "vmulss  $dst,$dst,$src2\n\t"
5887             "pshufd  $tmp,$src2,0x01\n\t"
5888             "vmulss  $dst,$dst,$tmp\n\t"
5889             "pshufd  $tmp,$src2,0x02\n\t"
5890             "vmulss  $dst,$dst,$tmp\n\t"
5891             "pshufd  $tmp,$src2,0x03\n\t"
5892             "vmulss  $dst,$dst,$tmp\n\t"
5893             "vextractf128_high  $tmp2,$src2\n\t"
5894             "vmulss  $dst,$dst,$tmp2\n\t"
5895             "pshufd  $tmp,$tmp2,0x01\n\t"
5896             "vmulss  $dst,$dst,$tmp\n\t"
5897             "pshufd  $tmp,$tmp2,0x02\n\t"
5898             "vmulss  $dst,$dst,$tmp\n\t"
5899             "pshufd  $tmp,$tmp2,0x03\n\t"
5900             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5901   ins_encode %{
5902     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5903     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5904     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5905     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5906     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5907     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5908     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5909     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5910     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5911     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5912     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5913     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5914     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5915     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5916     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5917   %}
5918   ins_pipe( pipe_slow );
5919 %}
5920 
5921 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5922   predicate(UseAVX > 2);
5923   match(Set dst (MulReductionVF dst src2));
5924   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5925   format %{ "vmulss  $dst,$dst,$src2\n\t"
5926             "pshufd  $tmp,$src2,0x01\n\t"
5927             "vmulss  $dst,$dst,$tmp\n\t"
5928             "pshufd  $tmp,$src2,0x02\n\t"
5929             "vmulss  $dst,$dst,$tmp\n\t"
5930             "pshufd  $tmp,$src2,0x03\n\t"
5931             "vmulss  $dst,$dst,$tmp\n\t"
5932             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5933             "vmulss  $dst,$dst,$tmp2\n\t"
5934             "pshufd  $tmp,$tmp2,0x01\n\t"
5935             "vmulss  $dst,$dst,$tmp\n\t"
5936             "pshufd  $tmp,$tmp2,0x02\n\t"
5937             "vmulss  $dst,$dst,$tmp\n\t"
5938             "pshufd  $tmp,$tmp2,0x03\n\t"
5939             "vmulss  $dst,$dst,$tmp\n\t"
5940             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5941             "vmulss  $dst,$dst,$tmp2\n\t"
5942             "pshufd  $tmp,$tmp2,0x01\n\t"
5943             "vmulss  $dst,$dst,$tmp\n\t"
5944             "pshufd  $tmp,$tmp2,0x02\n\t"
5945             "vmulss  $dst,$dst,$tmp\n\t"
5946             "pshufd  $tmp,$tmp2,0x03\n\t"
5947             "vmulss  $dst,$dst,$tmp\n\t"
5948             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5949             "vmulss  $dst,$dst,$tmp2\n\t"
5950             "pshufd  $tmp,$tmp2,0x01\n\t"
5951             "vmulss  $dst,$dst,$tmp\n\t"
5952             "pshufd  $tmp,$tmp2,0x02\n\t"
5953             "vmulss  $dst,$dst,$tmp\n\t"
5954             "pshufd  $tmp,$tmp2,0x03\n\t"
5955             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5956   ins_encode %{
5957     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5958     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5959     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5960     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5961     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5962     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5963     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5964     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5965     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5966     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5967     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5968     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5969     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5970     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5971     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5972     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5973     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5974     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5975     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5976     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5977     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5978     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5979     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5980     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5981     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5982     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5983     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5984     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5985     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5986     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5987     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5988   %}
5989   ins_pipe( pipe_slow );
5990 %}
5991 
5992 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5993   predicate(UseSSE >= 1 && UseAVX == 0);
5994   match(Set dst (MulReductionVD dst src2));
5995   effect(TEMP dst, TEMP tmp);
5996   format %{ "mulsd   $dst,$src2\n\t"
5997             "pshufd  $tmp,$src2,0xE\n\t"
5998             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5999   ins_encode %{
6000     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
6001     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6002     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
6003   %}
6004   ins_pipe( pipe_slow );
6005 %}
6006 
6007 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6008   predicate(UseAVX > 0);
6009   match(Set dst (MulReductionVD dst src2));
6010   effect(TEMP tmp, TEMP dst);
6011   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6012             "pshufd  $tmp,$src2,0xE\n\t"
6013             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
6014   ins_encode %{
6015     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6016     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6017     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6018   %}
6019   ins_pipe( pipe_slow );
6020 %}
6021 
6022 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
6023   predicate(UseAVX > 0);
6024   match(Set dst (MulReductionVD dst src2));
6025   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6026   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6027             "pshufd  $tmp,$src2,0xE\n\t"
6028             "vmulsd  $dst,$dst,$tmp\n\t"
6029             "vextractf128_high  $tmp2,$src2\n\t"
6030             "vmulsd  $dst,$dst,$tmp2\n\t"
6031             "pshufd  $tmp,$tmp2,0xE\n\t"
6032             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
6033   ins_encode %{
6034     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6035     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6036     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6037     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6038     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6039     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6040     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6041   %}
6042   ins_pipe( pipe_slow );
6043 %}
6044 
6045 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
6046   predicate(UseAVX > 2);
6047   match(Set dst (MulReductionVD dst src2));
6048   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6049   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6050             "pshufd  $tmp,$src2,0xE\n\t"
6051             "vmulsd  $dst,$dst,$tmp\n\t"
6052             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6053             "vmulsd  $dst,$dst,$tmp2\n\t"
6054             "pshufd  $tmp,$src2,0xE\n\t"
6055             "vmulsd  $dst,$dst,$tmp\n\t"
6056             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6057             "vmulsd  $dst,$dst,$tmp2\n\t"
6058             "pshufd  $tmp,$tmp2,0xE\n\t"
6059             "vmulsd  $dst,$dst,$tmp\n\t"
6060             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6061             "vmulsd  $dst,$dst,$tmp2\n\t"
6062             "pshufd  $tmp,$tmp2,0xE\n\t"
6063             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
6064   ins_encode %{
6065     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6066     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6067     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6068     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6069     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6070     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6071     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6072     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6073     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6074     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6075     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6076     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6077     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6078     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6079     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6080   %}
6081   ins_pipe( pipe_slow );
6082 %}
6083 
6084 // ====================VECTOR ARITHMETIC=======================================
6085 
6086 // --------------------------------- ADD --------------------------------------
6087 
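// Each packed add comes in up to three flavors: a two-operand SSE form
// (dst = dst + src), a three-operand AVX register form, and an AVX form with
// a memory operand. The vector_len argument selects the encoded vector width:
// 0 = 128-bit (XMM), 1 = 256-bit (YMM), 2 = 512-bit (ZMM).
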
6088 // Bytes vector add
6089 instruct vadd4B(vecS dst, vecS src) %{
6090   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6091   match(Set dst (AddVB dst src));
6092   format %{ "paddb   $dst,$src\t! add packed4B" %}
6093   ins_encode %{
6094     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6095   %}
6096   ins_pipe( pipe_slow );
6097 %}
6098 
6099 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
6100   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6101   match(Set dst (AddVB src1 src2));
6102   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
6103   ins_encode %{
6104     int vector_len = 0;
6105     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6106   %}
6107   ins_pipe( pipe_slow );
6108 %}
6109 
6111 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
6112   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6113   match(Set dst (AddVB src (LoadVector mem)));
6114   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6115   ins_encode %{
6116     int vector_len = 0;
6117     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6118   %}
6119   ins_pipe( pipe_slow );
6120 %}
6121 
6122 instruct vadd8B(vecD dst, vecD src) %{
6123   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6124   match(Set dst (AddVB dst src));
6125   format %{ "paddb   $dst,$src\t! add packed8B" %}
6126   ins_encode %{
6127     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6128   %}
6129   ins_pipe( pipe_slow );
6130 %}
6131 
6132 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6133   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6134   match(Set dst (AddVB src1 src2));
6135   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6136   ins_encode %{
6137     int vector_len = 0;
6138     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6139   %}
6140   ins_pipe( pipe_slow );
6141 %}
6142 
6144 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6145   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6146   match(Set dst (AddVB src (LoadVector mem)));
6147   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6148   ins_encode %{
6149     int vector_len = 0;
6150     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6151   %}
6152   ins_pipe( pipe_slow );
6153 %}
6154 
6155 instruct vadd16B(vecX dst, vecX src) %{
6156   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6157   match(Set dst (AddVB dst src));
6158   format %{ "paddb   $dst,$src\t! add packed16B" %}
6159   ins_encode %{
6160     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6161   %}
6162   ins_pipe( pipe_slow );
6163 %}
6164 
6165 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6166   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6167   match(Set dst (AddVB src1 src2));
6168   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6169   ins_encode %{
6170     int vector_len = 0;
6171     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6172   %}
6173   ins_pipe( pipe_slow );
6174 %}
6175 
6176 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6177   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6178   match(Set dst (AddVB src (LoadVector mem)));
6179   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6180   ins_encode %{
6181     int vector_len = 0;
6182     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6183   %}
6184   ins_pipe( pipe_slow );
6185 %}
6186 
6187 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6188   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6189   match(Set dst (AddVB src1 src2));
6190   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6191   ins_encode %{
6192     int vector_len = 1;
6193     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6194   %}
6195   ins_pipe( pipe_slow );
6196 %}
6197 
6198 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6199   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6200   match(Set dst (AddVB src (LoadVector mem)));
6201   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6202   ins_encode %{
6203     int vector_len = 1;
6204     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6205   %}
6206   ins_pipe( pipe_slow );
6207 %}
6208 
6209 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6210   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6211   match(Set dst (AddVB src1 src2));
6212   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6213   ins_encode %{
6214     int vector_len = 2;
6215     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6216   %}
6217   ins_pipe( pipe_slow );
6218 %}
6219 
6220 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6221   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6222   match(Set dst (AddVB src (LoadVector mem)));
6223   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6224   ins_encode %{
6225     int vector_len = 2;
6226     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6227   %}
6228   ins_pipe( pipe_slow );
6229 %}
6230 
6231 // Shorts/Chars vector add
6232 instruct vadd2S(vecS dst, vecS src) %{
6233   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6234   match(Set dst (AddVS dst src));
6235   format %{ "paddw   $dst,$src\t! add packed2S" %}
6236   ins_encode %{
6237     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6238   %}
6239   ins_pipe( pipe_slow );
6240 %}
6241 
6242 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6243   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6244   match(Set dst (AddVS src1 src2));
6245   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6246   ins_encode %{
6247     int vector_len = 0;
6248     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6249   %}
6250   ins_pipe( pipe_slow );
6251 %}
6252 
6253 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6254   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6255   match(Set dst (AddVS src (LoadVector mem)));
6256   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6257   ins_encode %{
6258     int vector_len = 0;
6259     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6260   %}
6261   ins_pipe( pipe_slow );
6262 %}
6263 
6264 instruct vadd4S(vecD dst, vecD src) %{
6265   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6266   match(Set dst (AddVS dst src));
6267   format %{ "paddw   $dst,$src\t! add packed4S" %}
6268   ins_encode %{
6269     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6270   %}
6271   ins_pipe( pipe_slow );
6272 %}
6273 
6274 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6275   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6276   match(Set dst (AddVS src1 src2));
6277   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6278   ins_encode %{
6279     int vector_len = 0;
6280     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6281   %}
6282   ins_pipe( pipe_slow );
6283 %}
6284 
6285 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6286   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6287   match(Set dst (AddVS src (LoadVector mem)));
6288   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6289   ins_encode %{
6290     int vector_len = 0;
6291     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6292   %}
6293   ins_pipe( pipe_slow );
6294 %}
6295 
6296 instruct vadd8S(vecX dst, vecX src) %{
6297   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6298   match(Set dst (AddVS dst src));
6299   format %{ "paddw   $dst,$src\t! add packed8S" %}
6300   ins_encode %{
6301     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6302   %}
6303   ins_pipe( pipe_slow );
6304 %}
6305 
6306 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6307   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6308   match(Set dst (AddVS src1 src2));
6309   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6310   ins_encode %{
6311     int vector_len = 0;
6312     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6313   %}
6314   ins_pipe( pipe_slow );
6315 %}
6316 
6317 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6318   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6319   match(Set dst (AddVS src (LoadVector mem)));
6320   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6321   ins_encode %{
6322     int vector_len = 0;
6323     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6324   %}
6325   ins_pipe( pipe_slow );
6326 %}
6327 
6328 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6329   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6330   match(Set dst (AddVS src1 src2));
6331   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6332   ins_encode %{
6333     int vector_len = 1;
6334     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6335   %}
6336   ins_pipe( pipe_slow );
6337 %}
6338 
6339 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6340   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6341   match(Set dst (AddVS src (LoadVector mem)));
6342   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6343   ins_encode %{
6344     int vector_len = 1;
6345     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6346   %}
6347   ins_pipe( pipe_slow );
6348 %}
6349 
6350 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6351   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6352   match(Set dst (AddVS src1 src2));
6353   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6354   ins_encode %{
6355     int vector_len = 2;
6356     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6357   %}
6358   ins_pipe( pipe_slow );
6359 %}
6360 
6361 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6362   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6363   match(Set dst (AddVS src (LoadVector mem)));
6364   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6365   ins_encode %{
6366     int vector_len = 2;
6367     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6368   %}
6369   ins_pipe( pipe_slow );
6370 %}
6371 
6372 // Integers vector add
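// Packed dword adds need only AVX-512F at 512 bits, so vadd16I carries no
// supports_avx512bw() check. For illustration only (array names are
// hypothetical), a Java loop of the following shape is what the SuperWord pass
// turns into AddVI nodes for these rules to match:
//
//   for (int i = 0; i < a.length; i++) {
//     c[i] = a[i] + b[i];    // int[] a, b, c of equal length
//   }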
6373 instruct vadd2I(vecD dst, vecD src) %{
6374   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6375   match(Set dst (AddVI dst src));
6376   format %{ "paddd   $dst,$src\t! add packed2I" %}
6377   ins_encode %{
6378     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6379   %}
6380   ins_pipe( pipe_slow );
6381 %}
6382 
6383 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6384   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6385   match(Set dst (AddVI src1 src2));
6386   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6387   ins_encode %{
6388     int vector_len = 0;
6389     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6390   %}
6391   ins_pipe( pipe_slow );
6392 %}
6393 
6394 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6395   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6396   match(Set dst (AddVI src (LoadVector mem)));
6397   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6398   ins_encode %{
6399     int vector_len = 0;
6400     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6401   %}
6402   ins_pipe( pipe_slow );
6403 %}
6404 
6405 instruct vadd4I(vecX dst, vecX src) %{
6406   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6407   match(Set dst (AddVI dst src));
6408   format %{ "paddd   $dst,$src\t! add packed4I" %}
6409   ins_encode %{
6410     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6411   %}
6412   ins_pipe( pipe_slow );
6413 %}
6414 
6415 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6416   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6417   match(Set dst (AddVI src1 src2));
6418   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6419   ins_encode %{
6420     int vector_len = 0;
6421     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6422   %}
6423   ins_pipe( pipe_slow );
6424 %}
6425 
6426 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6427   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6428   match(Set dst (AddVI src (LoadVector mem)));
6429   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6430   ins_encode %{
6431     int vector_len = 0;
6432     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6433   %}
6434   ins_pipe( pipe_slow );
6435 %}
6436 
6437 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6438   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6439   match(Set dst (AddVI src1 src2));
6440   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6441   ins_encode %{
6442     int vector_len = 1;
6443     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6444   %}
6445   ins_pipe( pipe_slow );
6446 %}
6447 
6448 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6449   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6450   match(Set dst (AddVI src (LoadVector mem)));
6451   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6452   ins_encode %{
6453     int vector_len = 1;
6454     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6455   %}
6456   ins_pipe( pipe_slow );
6457 %}
6458 
6459 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6460   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6461   match(Set dst (AddVI src1 src2));
6462   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6463   ins_encode %{
6464     int vector_len = 2;
6465     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6466   %}
6467   ins_pipe( pipe_slow );
6468 %}
6469 
6470 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6471   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6472   match(Set dst (AddVI src (LoadVector mem)));
6473   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6474   ins_encode %{
6475     int vector_len = 2;
6476     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6477   %}
6478   ins_pipe( pipe_slow );
6479 %}
6480 
6481 // Longs vector add
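// paddq/vpaddq perform modular 64-bit lane adds; as with the dword rules, the
// 512-bit qword form needs only AVX-512F, so there is no avx512bw() check.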
6482 instruct vadd2L(vecX dst, vecX src) %{
6483   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6484   match(Set dst (AddVL dst src));
6485   format %{ "paddq   $dst,$src\t! add packed2L" %}
6486   ins_encode %{
6487     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6488   %}
6489   ins_pipe( pipe_slow );
6490 %}
6491 
6492 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6493   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6494   match(Set dst (AddVL src1 src2));
6495   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6496   ins_encode %{
6497     int vector_len = 0;
6498     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6499   %}
6500   ins_pipe( pipe_slow );
6501 %}
6502 
6503 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6504   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6505   match(Set dst (AddVL src (LoadVector mem)));
6506   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6507   ins_encode %{
6508     int vector_len = 0;
6509     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6510   %}
6511   ins_pipe( pipe_slow );
6512 %}
6513 
6514 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6515   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6516   match(Set dst (AddVL src1 src2));
6517   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6518   ins_encode %{
6519     int vector_len = 1;
6520     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6521   %}
6522   ins_pipe( pipe_slow );
6523 %}
6524 
6525 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6526   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6527   match(Set dst (AddVL src (LoadVector mem)));
6528   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6529   ins_encode %{
6530     int vector_len = 1;
6531     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6532   %}
6533   ins_pipe( pipe_slow );
6534 %}
6535 
6536 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6537   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6538   match(Set dst (AddVL src1 src2));
6539   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6540   ins_encode %{
6541     int vector_len = 2;
6542     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6543   %}
6544   ins_pipe( pipe_slow );
6545 %}
6546 
6547 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6548   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6549   match(Set dst (AddVL src (LoadVector mem)));
6550   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6551   ins_encode %{
6552     int vector_len = 2;
6553     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6554   %}
6555   ins_pipe( pipe_slow );
6556 %}
6557 
6558 // Floats vector add
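// addps/vaddps are IEEE single-precision adds. Unlike the 256-bit integer
// forms, which require AVX2 (UseAVX > 1), the 256-bit FP forms need only
// UseAVX > 0, because 256-bit floating-point arithmetic is already part of AVX.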
6559 instruct vadd2F(vecD dst, vecD src) %{
6560   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6561   match(Set dst (AddVF dst src));
6562   format %{ "addps   $dst,$src\t! add packed2F" %}
6563   ins_encode %{
6564     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6565   %}
6566   ins_pipe( pipe_slow );
6567 %}
6568 
6569 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6570   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6571   match(Set dst (AddVF src1 src2));
6572   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6573   ins_encode %{
6574     int vector_len = 0;
6575     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6576   %}
6577   ins_pipe( pipe_slow );
6578 %}
6579 
6580 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6581   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6582   match(Set dst (AddVF src (LoadVector mem)));
6583   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6584   ins_encode %{
6585     int vector_len = 0;
6586     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6587   %}
6588   ins_pipe( pipe_slow );
6589 %}
6590 
6591 instruct vadd4F(vecX dst, vecX src) %{
6592   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6593   match(Set dst (AddVF dst src));
6594   format %{ "addps   $dst,$src\t! add packed4F" %}
6595   ins_encode %{
6596     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6597   %}
6598   ins_pipe( pipe_slow );
6599 %}
6600 
6601 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6602   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6603   match(Set dst (AddVF src1 src2));
6604   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6605   ins_encode %{
6606     int vector_len = 0;
6607     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6608   %}
6609   ins_pipe( pipe_slow );
6610 %}
6611 
6612 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6613   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6614   match(Set dst (AddVF src (LoadVector mem)));
6615   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6616   ins_encode %{
6617     int vector_len = 0;
6618     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6619   %}
6620   ins_pipe( pipe_slow );
6621 %}
6622 
6623 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6624   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6625   match(Set dst (AddVF src1 src2));
6626   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6627   ins_encode %{
6628     int vector_len = 1;
6629     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6630   %}
6631   ins_pipe( pipe_slow );
6632 %}
6633 
6634 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6635   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6636   match(Set dst (AddVF src (LoadVector mem)));
6637   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6638   ins_encode %{
6639     int vector_len = 1;
6640     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6641   %}
6642   ins_pipe( pipe_slow );
6643 %}
6644 
6645 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6646   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6647   match(Set dst (AddVF src1 src2));
6648   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6649   ins_encode %{
6650     int vector_len = 2;
6651     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6652   %}
6653   ins_pipe( pipe_slow );
6654 %}
6655 
6656 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6657   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6658   match(Set dst (AddVF src (LoadVector mem)));
6659   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6660   ins_encode %{
6661     int vector_len = 2;
6662     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6663   %}
6664   ins_pipe( pipe_slow );
6665 %}
6666 
6667 // Doubles vector add
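// addpd/vaddpd handle 2, 4 and 8 doubles in XMM, YMM and ZMM registers
// respectively; the 256-bit form likewise needs only UseAVX > 0.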
6668 instruct vadd2D(vecX dst, vecX src) %{
6669   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6670   match(Set dst (AddVD dst src));
6671   format %{ "addpd   $dst,$src\t! add packed2D" %}
6672   ins_encode %{
6673     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6674   %}
6675   ins_pipe( pipe_slow );
6676 %}
6677 
6678 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6679   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6680   match(Set dst (AddVD src1 src2));
6681   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6682   ins_encode %{
6683     int vector_len = 0;
6684     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6685   %}
6686   ins_pipe( pipe_slow );
6687 %}
6688 
6689 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6690   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6691   match(Set dst (AddVD src (LoadVector mem)));
6692   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6693   ins_encode %{
6694     int vector_len = 0;
6695     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6696   %}
6697   ins_pipe( pipe_slow );
6698 %}
6699 
6700 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6701   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6702   match(Set dst (AddVD src1 src2));
6703   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6704   ins_encode %{
6705     int vector_len = 1;
6706     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6707   %}
6708   ins_pipe( pipe_slow );
6709 %}
6710 
6711 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6712   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6713   match(Set dst (AddVD src (LoadVector mem)));
6714   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6715   ins_encode %{
6716     int vector_len = 1;
6717     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6718   %}
6719   ins_pipe( pipe_slow );
6720 %}
6721 
6722 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6723   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6724   match(Set dst (AddVD src1 src2));
6725   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6726   ins_encode %{
6727     int vector_len = 2;
6728     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6729   %}
6730   ins_pipe( pipe_slow );
6731 %}
6732 
6733 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6734   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6735   match(Set dst (AddVD src (LoadVector mem)));
6736   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6737   ins_encode %{
6738     int vector_len = 2;
6739     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6740   %}
6741   ins_pipe( pipe_slow );
6742 %}
6743 
6744 // --------------------------------- SUB --------------------------------------
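//
// The subtraction rules mirror the addition rules above: the same operand
// shapes, the same UseAVX/UseSSE predicates, and the same AVX-512BW gating on
// the 512-bit byte and short/char forms, with psub*/vpsub* (and subps/subpd,
// vsubps/vsubpd) in place of the corresponding add instructions.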
6745 
6746 // Bytes vector sub
6747 instruct vsub4B(vecS dst, vecS src) %{
6748   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6749   match(Set dst (SubVB dst src));
6750   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6751   ins_encode %{
6752     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6753   %}
6754   ins_pipe( pipe_slow );
6755 %}
6756 
6757 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6758   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6759   match(Set dst (SubVB src1 src2));
6760   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6761   ins_encode %{
6762     int vector_len = 0;
6763     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6764   %}
6765   ins_pipe( pipe_slow );
6766 %}
6767 
6768 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6769   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6770   match(Set dst (SubVB src (LoadVector mem)));
6771   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6772   ins_encode %{
6773     int vector_len = 0;
6774     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6775   %}
6776   ins_pipe( pipe_slow );
6777 %}
6778 
6779 instruct vsub8B(vecD dst, vecD src) %{
6780   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6781   match(Set dst (SubVB dst src));
6782   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6783   ins_encode %{
6784     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6785   %}
6786   ins_pipe( pipe_slow );
6787 %}
6788 
6789 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6790   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6791   match(Set dst (SubVB src1 src2));
6792   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6793   ins_encode %{
6794     int vector_len = 0;
6795     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6796   %}
6797   ins_pipe( pipe_slow );
6798 %}
6799 
6800 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6801   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6802   match(Set dst (SubVB src (LoadVector mem)));
6803   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6804   ins_encode %{
6805     int vector_len = 0;
6806     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6807   %}
6808   ins_pipe( pipe_slow );
6809 %}
6810 
6811 instruct vsub16B(vecX dst, vecX src) %{
6812   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6813   match(Set dst (SubVB dst src));
6814   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6815   ins_encode %{
6816     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6817   %}
6818   ins_pipe( pipe_slow );
6819 %}
6820 
6821 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6822   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6823   match(Set dst (SubVB src1 src2));
6824   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6825   ins_encode %{
6826     int vector_len = 0;
6827     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6828   %}
6829   ins_pipe( pipe_slow );
6830 %}
6831 
6832 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6833   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6834   match(Set dst (SubVB src (LoadVector mem)));
6835   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6836   ins_encode %{
6837     int vector_len = 0;
6838     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6839   %}
6840   ins_pipe( pipe_slow );
6841 %}
6842 
6843 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6844   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6845   match(Set dst (SubVB src1 src2));
6846   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6847   ins_encode %{
6848     int vector_len = 1;
6849     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6850   %}
6851   ins_pipe( pipe_slow );
6852 %}
6853 
6854 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6855   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6856   match(Set dst (SubVB src (LoadVector mem)));
6857   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6858   ins_encode %{
6859     int vector_len = 1;
6860     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6861   %}
6862   ins_pipe( pipe_slow );
6863 %}
6864 
6865 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6866   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6867   match(Set dst (SubVB src1 src2));
6868   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6869   ins_encode %{
6870     int vector_len = 2;
6871     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6872   %}
6873   ins_pipe( pipe_slow );
6874 %}
6875 
6876 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6877   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6878   match(Set dst (SubVB src (LoadVector mem)));
6879   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6880   ins_encode %{
6881     int vector_len = 2;
6882     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6883   %}
6884   ins_pipe( pipe_slow );
6885 %}
6886 
6887 // Shorts/Chars vector sub
6888 instruct vsub2S(vecS dst, vecS src) %{
6889   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6890   match(Set dst (SubVS dst src));
6891   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6892   ins_encode %{
6893     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6894   %}
6895   ins_pipe( pipe_slow );
6896 %}
6897 
6898 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6899   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6900   match(Set dst (SubVS src1 src2));
6901   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6902   ins_encode %{
6903     int vector_len = 0;
6904     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6905   %}
6906   ins_pipe( pipe_slow );
6907 %}
6908 
6909 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6910   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6911   match(Set dst (SubVS src (LoadVector mem)));
6912   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6913   ins_encode %{
6914     int vector_len = 0;
6915     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6916   %}
6917   ins_pipe( pipe_slow );
6918 %}
6919 
6920 instruct vsub4S(vecD dst, vecD src) %{
6921   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6922   match(Set dst (SubVS dst src));
6923   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6924   ins_encode %{
6925     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6926   %}
6927   ins_pipe( pipe_slow );
6928 %}
6929 
6930 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6931   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6932   match(Set dst (SubVS src1 src2));
6933   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6934   ins_encode %{
6935     int vector_len = 0;
6936     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6937   %}
6938   ins_pipe( pipe_slow );
6939 %}
6940 
6941 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6942   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6943   match(Set dst (SubVS src (LoadVector mem)));
6944   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6945   ins_encode %{
6946     int vector_len = 0;
6947     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6948   %}
6949   ins_pipe( pipe_slow );
6950 %}
6951 
6952 instruct vsub8S(vecX dst, vecX src) %{
6953   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6954   match(Set dst (SubVS dst src));
6955   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6956   ins_encode %{
6957     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6958   %}
6959   ins_pipe( pipe_slow );
6960 %}
6961 
6962 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6963   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6964   match(Set dst (SubVS src1 src2));
6965   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6966   ins_encode %{
6967     int vector_len = 0;
6968     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6969   %}
6970   ins_pipe( pipe_slow );
6971 %}
6972 
6973 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6974   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6975   match(Set dst (SubVS src (LoadVector mem)));
6976   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6977   ins_encode %{
6978     int vector_len = 0;
6979     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6980   %}
6981   ins_pipe( pipe_slow );
6982 %}
6983 
6984 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6985   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6986   match(Set dst (SubVS src1 src2));
6987   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6988   ins_encode %{
6989     int vector_len = 1;
6990     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6991   %}
6992   ins_pipe( pipe_slow );
6993 %}
6994 
6995 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6996   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6997   match(Set dst (SubVS src (LoadVector mem)));
6998   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6999   ins_encode %{
7000     int vector_len = 1;
7001     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7002   %}
7003   ins_pipe( pipe_slow );
7004 %}
7005 
7006 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7007   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7008   match(Set dst (SubVS src1 src2));
7009   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7010   ins_encode %{
7011     int vector_len = 2;
7012     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7013   %}
7014   ins_pipe( pipe_slow );
7015 %}
7016 
7017 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7018   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7019   match(Set dst (SubVS src (LoadVector mem)));
7020   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7021   ins_encode %{
7022     int vector_len = 2;
7023     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7024   %}
7025   ins_pipe( pipe_slow );
7026 %}
7027 
7028 // Integers vector sub
7029 instruct vsub2I(vecD dst, vecD src) %{
7030   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7031   match(Set dst (SubVI dst src));
7032   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7033   ins_encode %{
7034     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7035   %}
7036   ins_pipe( pipe_slow );
7037 %}
7038 
7039 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7040   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7041   match(Set dst (SubVI src1 src2));
7042   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7043   ins_encode %{
7044     int vector_len = 0;
7045     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7046   %}
7047   ins_pipe( pipe_slow );
7048 %}
7049 
7050 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7051   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7052   match(Set dst (SubVI src (LoadVector mem)));
7053   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7054   ins_encode %{
7055     int vector_len = 0;
7056     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7057   %}
7058   ins_pipe( pipe_slow );
7059 %}
7060 
7061 instruct vsub4I(vecX dst, vecX src) %{
7062   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7063   match(Set dst (SubVI dst src));
7064   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7065   ins_encode %{
7066     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7067   %}
7068   ins_pipe( pipe_slow );
7069 %}
7070 
7071 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7072   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7073   match(Set dst (SubVI src1 src2));
7074   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7075   ins_encode %{
7076     int vector_len = 0;
7077     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7078   %}
7079   ins_pipe( pipe_slow );
7080 %}
7081 
7082 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7083   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7084   match(Set dst (SubVI src (LoadVector mem)));
7085   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7086   ins_encode %{
7087     int vector_len = 0;
7088     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7089   %}
7090   ins_pipe( pipe_slow );
7091 %}
7092 
7093 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7094   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7095   match(Set dst (SubVI src1 src2));
7096   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7097   ins_encode %{
7098     int vector_len = 1;
7099     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7100   %}
7101   ins_pipe( pipe_slow );
7102 %}
7103 
7104 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7105   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7106   match(Set dst (SubVI src (LoadVector mem)));
7107   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7108   ins_encode %{
7109     int vector_len = 1;
7110     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7111   %}
7112   ins_pipe( pipe_slow );
7113 %}
7114 
7115 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7116   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7117   match(Set dst (SubVI src1 src2));
7118   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7119   ins_encode %{
7120     int vector_len = 2;
7121     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7122   %}
7123   ins_pipe( pipe_slow );
7124 %}
7125 
7126 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7127   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7128   match(Set dst (SubVI src (LoadVector mem)));
7129   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7130   ins_encode %{
7131     int vector_len = 2;
7132     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7133   %}
7134   ins_pipe( pipe_slow );
7135 %}
7136 
7137 // Longs vector sub
7138 instruct vsub2L(vecX dst, vecX src) %{
7139   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7140   match(Set dst (SubVL dst src));
7141   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7142   ins_encode %{
7143     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7144   %}
7145   ins_pipe( pipe_slow );
7146 %}
7147 
7148 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7149   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7150   match(Set dst (SubVL src1 src2));
7151   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7152   ins_encode %{
7153     int vector_len = 0;
7154     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7155   %}
7156   ins_pipe( pipe_slow );
7157 %}
7158 
7159 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7160   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7161   match(Set dst (SubVL src (LoadVector mem)));
7162   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7163   ins_encode %{
7164     int vector_len = 0;
7165     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7166   %}
7167   ins_pipe( pipe_slow );
7168 %}
7169 
7170 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7171   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7172   match(Set dst (SubVL src1 src2));
7173   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7174   ins_encode %{
7175     int vector_len = 1;
7176     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7177   %}
7178   ins_pipe( pipe_slow );
7179 %}
7180 
7181 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7182   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7183   match(Set dst (SubVL src (LoadVector mem)));
7184   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7185   ins_encode %{
7186     int vector_len = 1;
7187     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7188   %}
7189   ins_pipe( pipe_slow );
7190 %}
7191 
7192 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7193   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7194   match(Set dst (SubVL src1 src2));
7195   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7196   ins_encode %{
7197     int vector_len = 2;
7198     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7199   %}
7200   ins_pipe( pipe_slow );
7201 %}
7202 
7203 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7204   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7205   match(Set dst (SubVL src (LoadVector mem)));
7206   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7207   ins_encode %{
7208     int vector_len = 2;
7209     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7210   %}
7211   ins_pipe( pipe_slow );
7212 %}
7213 
7214 // Floats vector sub
7215 instruct vsub2F(vecD dst, vecD src) %{
7216   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7217   match(Set dst (SubVF dst src));
7218   format %{ "subps   $dst,$src\t! sub packed2F" %}
7219   ins_encode %{
7220     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7221   %}
7222   ins_pipe( pipe_slow );
7223 %}
7224 
7225 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7226   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7227   match(Set dst (SubVF src1 src2));
7228   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7229   ins_encode %{
7230     int vector_len = 0;
7231     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7232   %}
7233   ins_pipe( pipe_slow );
7234 %}
7235 
7236 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7237   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7238   match(Set dst (SubVF src (LoadVector mem)));
7239   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7240   ins_encode %{
7241     int vector_len = 0;
7242     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7243   %}
7244   ins_pipe( pipe_slow );
7245 %}
7246 
7247 instruct vsub4F(vecX dst, vecX src) %{
7248   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7249   match(Set dst (SubVF dst src));
7250   format %{ "subps   $dst,$src\t! sub packed4F" %}
7251   ins_encode %{
7252     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7253   %}
7254   ins_pipe( pipe_slow );
7255 %}
7256 
7257 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7258   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7259   match(Set dst (SubVF src1 src2));
7260   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7261   ins_encode %{
7262     int vector_len = 0;
7263     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7264   %}
7265   ins_pipe( pipe_slow );
7266 %}
7267 
7268 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7269   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7270   match(Set dst (SubVF src (LoadVector mem)));
7271   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7272   ins_encode %{
7273     int vector_len = 0;
7274     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7275   %}
7276   ins_pipe( pipe_slow );
7277 %}
7278 
7279 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7280   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7281   match(Set dst (SubVF src1 src2));
7282   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7283   ins_encode %{
7284     int vector_len = 1;
7285     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7286   %}
7287   ins_pipe( pipe_slow );
7288 %}
7289 
7290 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7291   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7292   match(Set dst (SubVF src (LoadVector mem)));
7293   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7294   ins_encode %{
7295     int vector_len = 1;
7296     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7297   %}
7298   ins_pipe( pipe_slow );
7299 %}
7300 
7301 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7302   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7303   match(Set dst (SubVF src1 src2));
7304   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7305   ins_encode %{
7306     int vector_len = 2;
7307     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7308   %}
7309   ins_pipe( pipe_slow );
7310 %}
7311 
7312 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7313   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7314   match(Set dst (SubVF src (LoadVector mem)));
7315   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7316   ins_encode %{
7317     int vector_len = 2;
7318     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7319   %}
7320   ins_pipe( pipe_slow );
7321 %}
7322 
7323 // Doubles vector sub
7324 instruct vsub2D(vecX dst, vecX src) %{
7325   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7326   match(Set dst (SubVD dst src));
7327   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7328   ins_encode %{
7329     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7330   %}
7331   ins_pipe( pipe_slow );
7332 %}
7333 
7334 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7335   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7336   match(Set dst (SubVD src1 src2));
7337   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7338   ins_encode %{
7339     int vector_len = 0;
7340     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7341   %}
7342   ins_pipe( pipe_slow );
7343 %}
7344 
7345 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7346   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7347   match(Set dst (SubVD src (LoadVector mem)));
7348   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7349   ins_encode %{
7350     int vector_len = 0;
7351     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7352   %}
7353   ins_pipe( pipe_slow );
7354 %}
7355 
7356 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7357   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7358   match(Set dst (SubVD src1 src2));
7359   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7360   ins_encode %{
7361     int vector_len = 1;
7362     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7363   %}
7364   ins_pipe( pipe_slow );
7365 %}
7366 
7367 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7368   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7369   match(Set dst (SubVD src (LoadVector mem)));
7370   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7371   ins_encode %{
7372     int vector_len = 1;
7373     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7374   %}
7375   ins_pipe( pipe_slow );
7376 %}
7377 
7378 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7379   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7380   match(Set dst (SubVD src1 src2));
7381   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7382   ins_encode %{
7383     int vector_len = 2;
7384     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7385   %}
7386   ins_pipe( pipe_slow );
7387 %}
7388 
7389 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7390   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7391   match(Set dst (SubVD src (LoadVector mem)));
7392   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7393   ins_encode %{
7394     int vector_len = 2;
7395     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7396   %}
7397   ins_pipe( pipe_slow );
7398 %}
7399 
7400 // --------------------------------- MUL --------------------------------------
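//
// Unlike ADD and SUB, there are no byte multiply rules here; packed
// multiplication starts at short/char lanes.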
7401 
7402 // Shorts/Chars vector mul
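// pmullw/vpmullw keep only the low 16 bits of each 16x16-bit product, which
// matches Java's wrap-around semantics for vectorized short/char multiplies.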
7403 instruct vmul2S(vecS dst, vecS src) %{
7404   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7405   match(Set dst (MulVS dst src));
7406   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
7407   ins_encode %{
7408     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7409   %}
7410   ins_pipe( pipe_slow );
7411 %}
7412 
7413 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7414   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7415   match(Set dst (MulVS src1 src2));
7416   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7417   ins_encode %{
7418     int vector_len = 0;
7419     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7420   %}
7421   ins_pipe( pipe_slow );
7422 %}
7423 
7424 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7425   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7426   match(Set dst (MulVS src (LoadVector mem)));
7427   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7428   ins_encode %{
7429     int vector_len = 0;
7430     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7431   %}
7432   ins_pipe( pipe_slow );
7433 %}
7434 
7435 instruct vmul4S(vecD dst, vecD src) %{
7436   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7437   match(Set dst (MulVS dst src));
7438   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7439   ins_encode %{
7440     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7441   %}
7442   ins_pipe( pipe_slow );
7443 %}
7444 
7445 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7446   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7447   match(Set dst (MulVS src1 src2));
7448   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7449   ins_encode %{
7450     int vector_len = 0;
7451     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7452   %}
7453   ins_pipe( pipe_slow );
7454 %}
7455 
7456 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7457   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7458   match(Set dst (MulVS src (LoadVector mem)));
7459   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7460   ins_encode %{
7461     int vector_len = 0;
7462     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7463   %}
7464   ins_pipe( pipe_slow );
7465 %}
7466 
7467 instruct vmul8S(vecX dst, vecX src) %{
7468   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7469   match(Set dst (MulVS dst src));
7470   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7471   ins_encode %{
7472     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7473   %}
7474   ins_pipe( pipe_slow );
7475 %}
7476 
7477 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7478   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7479   match(Set dst (MulVS src1 src2));
7480   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7481   ins_encode %{
7482     int vector_len = 0;
7483     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7484   %}
7485   ins_pipe( pipe_slow );
7486 %}
7487 
7488 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7489   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7490   match(Set dst (MulVS src (LoadVector mem)));
7491   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7492   ins_encode %{
7493     int vector_len = 0;
7494     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7495   %}
7496   ins_pipe( pipe_slow );
7497 %}
7498 
7499 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7500   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7501   match(Set dst (MulVS src1 src2));
7502   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7503   ins_encode %{
7504     int vector_len = 1;
7505     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7506   %}
7507   ins_pipe( pipe_slow );
7508 %}
7509 
7510 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7511   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7512   match(Set dst (MulVS src (LoadVector mem)));
7513   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7514   ins_encode %{
7515     int vector_len = 1;
7516     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7517   %}
7518   ins_pipe( pipe_slow );
7519 %}
7520 
7521 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7522   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7523   match(Set dst (MulVS src1 src2));
7524   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7525   ins_encode %{
7526     int vector_len = 2;
7527     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7528   %}
7529   ins_pipe( pipe_slow );
7530 %}
7531 
7532 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7533   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7534   match(Set dst (MulVS src (LoadVector mem)));
7535   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7536   ins_encode %{
7537     int vector_len = 2;
7538     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7539   %}
7540   ins_pipe( pipe_slow );
7541 %}
7542 
7543 // Integers vector mul (sse4_1)
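// pmulld, the truncating 32x32->32 multiply, was introduced with SSE4.1, hence
// the UseSSE > 3 predicate on the non-AVX rules; the AVX rules use vpmulld.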
7544 instruct vmul2I(vecD dst, vecD src) %{
7545   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7546   match(Set dst (MulVI dst src));
7547   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7548   ins_encode %{
7549     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7550   %}
7551   ins_pipe( pipe_slow );
7552 %}
7553 
7554 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7555   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7556   match(Set dst (MulVI src1 src2));
7557   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7558   ins_encode %{
7559     int vector_len = 0;
7560     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7561   %}
7562   ins_pipe( pipe_slow );
7563 %}
7564 
7565 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7566   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7567   match(Set dst (MulVI src (LoadVector mem)));
7568   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7569   ins_encode %{
7570     int vector_len = 0;
7571     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7572   %}
7573   ins_pipe( pipe_slow );
7574 %}
7575 
7576 instruct vmul4I(vecX dst, vecX src) %{
7577   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7578   match(Set dst (MulVI dst src));
7579   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7580   ins_encode %{
7581     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7582   %}
7583   ins_pipe( pipe_slow );
7584 %}
7585 
7586 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7587   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7588   match(Set dst (MulVI src1 src2));
7589   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7590   ins_encode %{
7591     int vector_len = 0;
7592     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7593   %}
7594   ins_pipe( pipe_slow );
7595 %}
7596 
7597 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7598   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7599   match(Set dst (MulVI src (LoadVector mem)));
7600   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7601   ins_encode %{
7602     int vector_len = 0;
7603     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7604   %}
7605   ins_pipe( pipe_slow );
7606 %}
7607 
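// Packed 64-bit multiply (vpmullq) is an AVX-512DQ instruction, so the long
// multiply rules below require VM_Version::supports_avx512dq() in addition to
// UseAVX > 2; MulVL is only matched here when DQ is available.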
7608 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7609   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7610   match(Set dst (MulVL src1 src2));
7611   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7612   ins_encode %{
7613     int vector_len = 0;
7614     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7615   %}
7616   ins_pipe( pipe_slow );
7617 %}
7618 
7619 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7620   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7621   match(Set dst (MulVL src (LoadVector mem)));
7622   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7623   ins_encode %{
7624     int vector_len = 0;
7625     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7626   %}
7627   ins_pipe( pipe_slow );
7628 %}
7629 
7630 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7631   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7632   match(Set dst (MulVL src1 src2));
7633   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7634   ins_encode %{
7635     int vector_len = 1;
7636     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7637   %}
7638   ins_pipe( pipe_slow );
7639 %}
7640 
7641 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7642   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7643   match(Set dst (MulVL src (LoadVector mem)));
7644   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7645   ins_encode %{
7646     int vector_len = 1;
7647     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7648   %}
7649   ins_pipe( pipe_slow );
7650 %}
7651 
7652 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7653   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7654   match(Set dst (MulVL src1 src2));
7655   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7656   ins_encode %{
7657     int vector_len = 2;
7658     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7659   %}
7660   ins_pipe( pipe_slow );
7661 %}
7662 
7663 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7664   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7665   match(Set dst (MulVL src (LoadVector mem)));
7666   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7667   ins_encode %{
7668     int vector_len = 2;
7669     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7670   %}
7671   ins_pipe( pipe_slow );
7672 %}
7673 
7674 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7675   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7676   match(Set dst (MulVI src1 src2));
7677   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7678   ins_encode %{
7679     int vector_len = 1;
7680     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7681   %}
7682   ins_pipe( pipe_slow );
7683 %}
7684 
7685 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7686   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7687   match(Set dst (MulVI src (LoadVector mem)));
7688   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7689   ins_encode %{
7690     int vector_len = 1;
7691     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7692   %}
7693   ins_pipe( pipe_slow );
7694 %}
7695 
7696 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7697   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7698   match(Set dst (MulVI src1 src2));
7699   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7700   ins_encode %{
7701     int vector_len = 2;
7702     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7703   %}
7704   ins_pipe( pipe_slow );
7705 %}
7706 
7707 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7708   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7709   match(Set dst (MulVI src (LoadVector mem)));
7710   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7711   ins_encode %{
7712     int vector_len = 2;
7713     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7714   %}
7715   ins_pipe( pipe_slow );
7716 %}
7717 
7718 // Floats vector mul
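// The FP multiply rules have the same gating as the FP add rules: the 256-bit
// float forms need only UseAVX > 0, the 512-bit forms need AVX-512F.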
7719 instruct vmul2F(vecD dst, vecD src) %{
7720   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7721   match(Set dst (MulVF dst src));
7722   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7723   ins_encode %{
7724     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7725   %}
7726   ins_pipe( pipe_slow );
7727 %}
7728 
7729 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7730   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7731   match(Set dst (MulVF src1 src2));
7732   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7733   ins_encode %{
7734     int vector_len = 0;
7735     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7736   %}
7737   ins_pipe( pipe_slow );
7738 %}
7739 
7740 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7741   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7742   match(Set dst (MulVF src (LoadVector mem)));
7743   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7744   ins_encode %{
7745     int vector_len = 0;
7746     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7747   %}
7748   ins_pipe( pipe_slow );
7749 %}
7750 
7751 instruct vmul4F(vecX dst, vecX src) %{
7752   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7753   match(Set dst (MulVF dst src));
7754   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7755   ins_encode %{
7756     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7757   %}
7758   ins_pipe( pipe_slow );
7759 %}
7760 
7761 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7762   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7763   match(Set dst (MulVF src1 src2));
7764   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7765   ins_encode %{
7766     int vector_len = 0;
7767     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7768   %}
7769   ins_pipe( pipe_slow );
7770 %}
7771 
7772 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7773   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7774   match(Set dst (MulVF src (LoadVector mem)));
7775   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7776   ins_encode %{
7777     int vector_len = 0;
7778     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7779   %}
7780   ins_pipe( pipe_slow );
7781 %}
7782 
7783 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7784   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7785   match(Set dst (MulVF src1 src2));
7786   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7787   ins_encode %{
7788     int vector_len = 1;
7789     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7790   %}
7791   ins_pipe( pipe_slow );
7792 %}
7793 
7794 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7795   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7796   match(Set dst (MulVF src (LoadVector mem)));
7797   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7798   ins_encode %{
7799     int vector_len = 1;
7800     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7801   %}
7802   ins_pipe( pipe_slow );
7803 %}
7804 
7805 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7806   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7807   match(Set dst (MulVF src1 src2));
7808   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7809   ins_encode %{
7810     int vector_len = 2;
7811     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7812   %}
7813   ins_pipe( pipe_slow );
7814 %}
7815 
7816 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7817   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7818   match(Set dst (MulVF src (LoadVector mem)));
7819   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7820   ins_encode %{
7821     int vector_len = 2;
7822     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7823   %}
7824   ins_pipe( pipe_slow );
7825 %}
7826 
7827 // Doubles vector mul
7828 instruct vmul2D(vecX dst, vecX src) %{
7829   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7830   match(Set dst (MulVD dst src));
7831   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7832   ins_encode %{
7833     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7834   %}
7835   ins_pipe( pipe_slow );
7836 %}
7837 
7838 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7839   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7840   match(Set dst (MulVD src1 src2));
7841   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7842   ins_encode %{
7843     int vector_len = 0;
7844     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7845   %}
7846   ins_pipe( pipe_slow );
7847 %}
7848 
7849 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7850   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7851   match(Set dst (MulVD src (LoadVector mem)));
7852   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7853   ins_encode %{
7854     int vector_len = 0;
7855     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7856   %}
7857   ins_pipe( pipe_slow );
7858 %}
7859 
7860 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7861   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7862   match(Set dst (MulVD src1 src2));
7863   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7864   ins_encode %{
7865     int vector_len = 1;
7866     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7867   %}
7868   ins_pipe( pipe_slow );
7869 %}
7870 
7871 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7872   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7873   match(Set dst (MulVD src (LoadVector mem)));
7874   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7875   ins_encode %{
7876     int vector_len = 1;
7877     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7878   %}
7879   ins_pipe( pipe_slow );
7880 %}
7881 
7882 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7883   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7884   match(Set dst (MulVD src1 src2));
7885   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
7886   ins_encode %{
7887     int vector_len = 2;
7888     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7889   %}
7890   ins_pipe( pipe_slow );
7891 %}
7892 
7893 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7894   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7895   match(Set dst (MulVD src (LoadVector mem)));
7896   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
7897   ins_encode %{
7898     int vector_len = 2;
7899     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7900   %}
7901   ins_pipe( pipe_slow );
7902 %}
7903 
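// Vector conditional moves are implemented below as a two-step sequence: a
// packed compare (cmpps/cmppd) writes a per-lane all-ones/all-zeros mask into
// $dst, and a variable blend (blendvps/blendvpd) then picks each lane from
// $src1 or $src2 according to that mask.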
7904 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7905   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7906   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7907   effect(TEMP dst, USE src1, USE src2);
7908   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7909             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7910          %}
7911   ins_encode %{
7912     int vector_len = 1;
7913     int cond = (Assembler::Condition)($copnd$$cmpcode);
7914     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7915     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7916   %}
7917   ins_pipe( pipe_slow );
7918 %}
7919 
7920 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7921   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7922   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7923   effect(TEMP dst, USE src1, USE src2);
7924   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7925             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7926          %}
7927   ins_encode %{
7928     int vector_len = 1;
7929     int cond = (Assembler::Condition)($copnd$$cmpcode);
7930     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7931     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7932   %}
7933   ins_pipe( pipe_slow );
7934 %}
7935 
7936 // --------------------------------- DIV --------------------------------------
7937 
7938 // Floats vector div
7939 instruct vdiv2F(vecD dst, vecD src) %{
7940   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7941   match(Set dst (DivVF dst src));
7942   format %{ "divps   $dst,$src\t! div packed2F" %}
7943   ins_encode %{
7944     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7945   %}
7946   ins_pipe( pipe_slow );
7947 %}
7948 
7949 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7950   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7951   match(Set dst (DivVF src1 src2));
7952   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
7953   ins_encode %{
7954     int vector_len = 0;
7955     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7956   %}
7957   ins_pipe( pipe_slow );
7958 %}
7959 
7960 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
7961   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7962   match(Set dst (DivVF src (LoadVector mem)));
7963   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
7964   ins_encode %{
7965     int vector_len = 0;
7966     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7967   %}
7968   ins_pipe( pipe_slow );
7969 %}
7970 
7971 instruct vdiv4F(vecX dst, vecX src) %{
7972   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7973   match(Set dst (DivVF dst src));
7974   format %{ "divps   $dst,$src\t! div packed4F" %}
7975   ins_encode %{
7976     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7977   %}
7978   ins_pipe( pipe_slow );
7979 %}
7980 
7981 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
7982   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7983   match(Set dst (DivVF src1 src2));
7984   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
7985   ins_encode %{
7986     int vector_len = 0;
7987     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7988   %}
7989   ins_pipe( pipe_slow );
7990 %}
7991 
7992 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
7993   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7994   match(Set dst (DivVF src (LoadVector mem)));
7995   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
7996   ins_encode %{
7997     int vector_len = 0;
7998     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7999   %}
8000   ins_pipe( pipe_slow );
8001 %}
8002 
8003 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8004   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8005   match(Set dst (DivVF src1 src2));
8006   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8007   ins_encode %{
8008     int vector_len = 1;
8009     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8010   %}
8011   ins_pipe( pipe_slow );
8012 %}
8013 
8014 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8015   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8016   match(Set dst (DivVF src (LoadVector mem)));
8017   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8018   ins_encode %{
8019     int vector_len = 1;
8020     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8021   %}
8022   ins_pipe( pipe_slow );
8023 %}
8024 
8025 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8026   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8027   match(Set dst (DivVF src1 src2));
8028   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8029   ins_encode %{
8030     int vector_len = 2;
8031     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8032   %}
8033   ins_pipe( pipe_slow );
8034 %}
8035 
8036 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8037   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8038   match(Set dst (DivVF src (LoadVector mem)));
8039   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8040   ins_encode %{
8041     int vector_len = 2;
8042     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8043   %}
8044   ins_pipe( pipe_slow );
8045 %}
8046 
8047 // Doubles vector div
8048 instruct vdiv2D(vecX dst, vecX src) %{
8049   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8050   match(Set dst (DivVD dst src));
8051   format %{ "divpd   $dst,$src\t! div packed2D" %}
8052   ins_encode %{
8053     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8054   %}
8055   ins_pipe( pipe_slow );
8056 %}
8057 
8058 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8059   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8060   match(Set dst (DivVD src1 src2));
8061   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8062   ins_encode %{
8063     int vector_len = 0;
8064     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8065   %}
8066   ins_pipe( pipe_slow );
8067 %}
8068 
8069 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8070   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8071   match(Set dst (DivVD src (LoadVector mem)));
8072   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8073   ins_encode %{
8074     int vector_len = 0;
8075     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8076   %}
8077   ins_pipe( pipe_slow );
8078 %}
8079 
8080 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8081   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8082   match(Set dst (DivVD src1 src2));
8083   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8084   ins_encode %{
8085     int vector_len = 1;
8086     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8087   %}
8088   ins_pipe( pipe_slow );
8089 %}
8090 
8091 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8092   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8093   match(Set dst (DivVD src (LoadVector mem)));
8094   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8095   ins_encode %{
8096     int vector_len = 1;
8097     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8098   %}
8099   ins_pipe( pipe_slow );
8100 %}
8101 
8102 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8103   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8104   match(Set dst (DivVD src1 src2));
8105   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8106   ins_encode %{
8107     int vector_len = 2;
8108     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8109   %}
8110   ins_pipe( pipe_slow );
8111 %}
8112 
8113 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8114   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8115   match(Set dst (DivVD src (LoadVector mem)));
8116   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8117   ins_encode %{
8118     int vector_len = 2;
8119     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8120   %}
8121   ins_pipe( pipe_slow );
8122 %}
8123 
8124 // ------------------------------ Shift ---------------------------------------
8125 
8126 // Left and right shift count vectors are the same on x86
8127 // (only the lowest bits of the xmm register are used for the count).
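// Note (per the Intel SDM): if the count in the xmm register exceeds the
// element width, the packed logical shifts (psllw/d/q, psrlw/d/q) zero the
// destination lanes, while the packed arithmetic right shifts (psraw/psrad)
// fill each lane with copies of its sign bit.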
8128 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8129   match(Set dst (LShiftCntV cnt));
8130   match(Set dst (RShiftCntV cnt));
8131   format %{ "movd    $dst,$cnt\t! load shift count" %}
8132   ins_encode %{
8133     __ movdl($dst$$XMMRegister, $cnt$$Register);
8134   %}
8135   ins_pipe( pipe_slow );
8136 %}
8137 
8138 // --------------------------------- Sqrt --------------------------------------
8139 
8140 // Floating point vector sqrt
8141 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8142   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8143   match(Set dst (SqrtVD src));
8144   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8145   ins_encode %{
8146     int vector_len = 0;
8147     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8148   %}
8149   ins_pipe( pipe_slow );
8150 %}
8151 
8152 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8153   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8154   match(Set dst (SqrtVD (LoadVector mem)));
8155   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8156   ins_encode %{
8157     int vector_len = 0;
8158     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8159   %}
8160   ins_pipe( pipe_slow );
8161 %}
8162 
8163 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8164   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8165   match(Set dst (SqrtVD src));
8166   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8167   ins_encode %{
8168     int vector_len = 1;
8169     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8170   %}
8171   ins_pipe( pipe_slow );
8172 %}
8173 
8174 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8175   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8176   match(Set dst (SqrtVD (LoadVector mem)));
8177   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8178   ins_encode %{
8179     int vector_len = 1;
8180     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8181   %}
8182   ins_pipe( pipe_slow );
8183 %}
8184 
8185 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8186   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8187   match(Set dst (SqrtVD src));
8188   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8189   ins_encode %{
8190     int vector_len = 2;
8191     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8192   %}
8193   ins_pipe( pipe_slow );
8194 %}
8195 
8196 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8197   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8198   match(Set dst (SqrtVD (LoadVector mem)));
8199   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8200   ins_encode %{
8201     int vector_len = 2;
8202     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8203   %}
8204   ins_pipe( pipe_slow );
8205 %}
8206 
8207 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8208   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8209   match(Set dst (SqrtVF src));
8210   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8211   ins_encode %{
8212     int vector_len = 0;
8213     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8214   %}
8215   ins_pipe( pipe_slow );
8216 %}
8217 
8218 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8219   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8220   match(Set dst (SqrtVF (LoadVector mem)));
8221   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8222   ins_encode %{
8223     int vector_len = 0;
8224     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8225   %}
8226   ins_pipe( pipe_slow );
8227 %}
8228 
8229 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8230   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8231   match(Set dst (SqrtVF src));
8232   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8233   ins_encode %{
8234     int vector_len = 0;
8235     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8236   %}
8237   ins_pipe( pipe_slow );
8238 %}
8239 
8240 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8241   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8242   match(Set dst (SqrtVF (LoadVector mem)));
8243   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8244   ins_encode %{
8245     int vector_len = 0;
8246     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8247   %}
8248   ins_pipe( pipe_slow );
8249 %}
8250 
8251 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8252   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8253   match(Set dst (SqrtVF src));
8254   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8255   ins_encode %{
8256     int vector_len = 1;
8257     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8258   %}
8259   ins_pipe( pipe_slow );
8260 %}
8261 
8262 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8263   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8264   match(Set dst (SqrtVF (LoadVector mem)));
8265   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8266   ins_encode %{
8267     int vector_len = 1;
8268     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8269   %}
8270   ins_pipe( pipe_slow );
8271 %}
8272 
8273 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8274   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8275   match(Set dst (SqrtVF src));
8276   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8277   ins_encode %{
8278     int vector_len = 2;
8279     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8280   %}
8281   ins_pipe( pipe_slow );
8282 %}
8283 
8284 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8285   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8286   match(Set dst (SqrtVF (LoadVector mem)));
8287   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8288   ins_encode %{
8289     int vector_len = 2;
8290     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8291   %}
8292   ins_pipe( pipe_slow );
8293 %}
8294 
8295 // ------------------------------ LeftShift -----------------------------------
8296 
8297 // Shorts/Chars vector left shift
8298 instruct vsll2S(vecS dst, vecS shift) %{
8299   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8300   match(Set dst (LShiftVS dst shift));
8301   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8302   ins_encode %{
8303     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8304   %}
8305   ins_pipe( pipe_slow );
8306 %}
8307 
8308 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8309   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8310   match(Set dst (LShiftVS dst shift));
8311   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8312   ins_encode %{
8313     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8314   %}
8315   ins_pipe( pipe_slow );
8316 %}
8317 
8318 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
8319   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8320   match(Set dst (LShiftVS src shift));
8321   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8322   ins_encode %{
8323     int vector_len = 0;
8324     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8325   %}
8326   ins_pipe( pipe_slow );
8327 %}
8328 
8329 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8330   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8331   match(Set dst (LShiftVS src shift));
8332   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8333   ins_encode %{
8334     int vector_len = 0;
8335     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8336   %}
8337   ins_pipe( pipe_slow );
8338 %}
8339 
8340 instruct vsll4S(vecD dst, vecS shift) %{
8341   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8342   match(Set dst (LShiftVS dst shift));
8343   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8344   ins_encode %{
8345     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8346   %}
8347   ins_pipe( pipe_slow );
8348 %}
8349 
8350 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8351   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8352   match(Set dst (LShiftVS dst shift));
8353   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8354   ins_encode %{
8355     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8356   %}
8357   ins_pipe( pipe_slow );
8358 %}
8359 
8360 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
8361   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8362   match(Set dst (LShiftVS src shift));
8363   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8364   ins_encode %{
8365     int vector_len = 0;
8366     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8367   %}
8368   ins_pipe( pipe_slow );
8369 %}
8370 
8371 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8372   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8373   match(Set dst (LShiftVS src shift));
8374   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8375   ins_encode %{
8376     int vector_len = 0;
8377     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8378   %}
8379   ins_pipe( pipe_slow );
8380 %}
8381 
8382 instruct vsll8S(vecX dst, vecS shift) %{
8383   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8384   match(Set dst (LShiftVS dst shift));
8385   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8386   ins_encode %{
8387     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8388   %}
8389   ins_pipe( pipe_slow );
8390 %}
8391 
8392 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8393   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8394   match(Set dst (LShiftVS dst shift));
8395   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8396   ins_encode %{
8397     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8398   %}
8399   ins_pipe( pipe_slow );
8400 %}
8401 
8402 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
8403   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8404   match(Set dst (LShiftVS src shift));
8405   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8406   ins_encode %{
8407     int vector_len = 0;
8408     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8409   %}
8410   ins_pipe( pipe_slow );
8411 %}
8412 
8413 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8414   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8415   match(Set dst (LShiftVS src shift));
8416   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8417   ins_encode %{
8418     int vector_len = 0;
8419     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8420   %}
8421   ins_pipe( pipe_slow );
8422 %}
8423 
8424 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
8425   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8426   match(Set dst (LShiftVS src shift));
8427   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8428   ins_encode %{
8429     int vector_len = 1;
8430     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8431   %}
8432   ins_pipe( pipe_slow );
8433 %}
8434 
8435 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8436   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8437   match(Set dst (LShiftVS src shift));
8438   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8439   ins_encode %{
8440     int vector_len = 1;
8441     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8442   %}
8443   ins_pipe( pipe_slow );
8444 %}
8445 
8446 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8447   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8448   match(Set dst (LShiftVS src shift));
8449   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8450   ins_encode %{
8451     int vector_len = 2;
8452     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8453   %}
8454   ins_pipe( pipe_slow );
8455 %}
8456 
8457 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8458   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8459   match(Set dst (LShiftVS src shift));
8460   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8461   ins_encode %{
8462     int vector_len = 2;
8463     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8464   %}
8465   ins_pipe( pipe_slow );
8466 %}
8467 
8468 // Integers vector left shift
8469 instruct vsll2I(vecD dst, vecS shift) %{
8470   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8471   match(Set dst (LShiftVI dst shift));
8472   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8473   ins_encode %{
8474     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8475   %}
8476   ins_pipe( pipe_slow );
8477 %}
8478 
8479 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8480   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8481   match(Set dst (LShiftVI dst shift));
8482   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8483   ins_encode %{
8484     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8485   %}
8486   ins_pipe( pipe_slow );
8487 %}
8488 
8489 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8490   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8491   match(Set dst (LShiftVI src shift));
8492   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8493   ins_encode %{
8494     int vector_len = 0;
8495     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8496   %}
8497   ins_pipe( pipe_slow );
8498 %}
8499 
8500 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8501   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8502   match(Set dst (LShiftVI src shift));
8503   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8504   ins_encode %{
8505     int vector_len = 0;
8506     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8507   %}
8508   ins_pipe( pipe_slow );
8509 %}
8510 
8511 instruct vsll4I(vecX dst, vecS shift) %{
8512   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8513   match(Set dst (LShiftVI dst shift));
8514   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8515   ins_encode %{
8516     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8517   %}
8518   ins_pipe( pipe_slow );
8519 %}
8520 
8521 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8522   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8523   match(Set dst (LShiftVI dst shift));
8524   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8525   ins_encode %{
8526     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8527   %}
8528   ins_pipe( pipe_slow );
8529 %}
8530 
8531 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8532   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8533   match(Set dst (LShiftVI src shift));
8534   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8535   ins_encode %{
8536     int vector_len = 0;
8537     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8538   %}
8539   ins_pipe( pipe_slow );
8540 %}
8541 
8542 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8543   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8544   match(Set dst (LShiftVI src shift));
8545   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8546   ins_encode %{
8547     int vector_len = 0;
8548     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8549   %}
8550   ins_pipe( pipe_slow );
8551 %}
8552 
8553 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
8554   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8555   match(Set dst (LShiftVI src shift));
8556   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8557   ins_encode %{
8558     int vector_len = 1;
8559     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8560   %}
8561   ins_pipe( pipe_slow );
8562 %}
8563 
8564 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8565   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8566   match(Set dst (LShiftVI src shift));
8567   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8568   ins_encode %{
8569     int vector_len = 1;
8570     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8571   %}
8572   ins_pipe( pipe_slow );
8573 %}
8574 
8575 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
8576   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8577   match(Set dst (LShiftVI src shift));
8578   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8579   ins_encode %{
8580     int vector_len = 2;
8581     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8582   %}
8583   ins_pipe( pipe_slow );
8584 %}
8585 
8586 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8587   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8588   match(Set dst (LShiftVI src shift));
8589   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8590   ins_encode %{
8591     int vector_len = 2;
8592     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8593   %}
8594   ins_pipe( pipe_slow );
8595 %}
8596 
8597 // Longs vector left shift
8598 instruct vsll2L(vecX dst, vecS shift) %{
8599   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8600   match(Set dst (LShiftVL dst shift));
8601   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8602   ins_encode %{
8603     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
8604   %}
8605   ins_pipe( pipe_slow );
8606 %}
8607 
8608 instruct vsll2L_imm(vecX dst, immI8 shift) %{
8609   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8610   match(Set dst (LShiftVL dst shift));
8611   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8612   ins_encode %{
8613     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
8614   %}
8615   ins_pipe( pipe_slow );
8616 %}
8617 
8618 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
8619   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8620   match(Set dst (LShiftVL src shift));
8621   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8622   ins_encode %{
8623     int vector_len = 0;
8624     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8625   %}
8626   ins_pipe( pipe_slow );
8627 %}
8628 
8629 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8630   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8631   match(Set dst (LShiftVL src shift));
8632   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8633   ins_encode %{
8634     int vector_len = 0;
8635     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8636   %}
8637   ins_pipe( pipe_slow );
8638 %}
8639 
8640 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
8641   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8642   match(Set dst (LShiftVL src shift));
8643   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8644   ins_encode %{
8645     int vector_len = 1;
8646     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8647   %}
8648   ins_pipe( pipe_slow );
8649 %}
8650 
8651 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8652   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8653   match(Set dst (LShiftVL src shift));
8654   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8655   ins_encode %{
8656     int vector_len = 1;
8657     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8658   %}
8659   ins_pipe( pipe_slow );
8660 %}
8661 
8662 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8663   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8664   match(Set dst (LShiftVL src shift));
8665   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8666   ins_encode %{
8667     int vector_len = 2;
8668     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8669   %}
8670   ins_pipe( pipe_slow );
8671 %}
8672 
8673 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8674   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8675   match(Set dst (LShiftVL src shift));
8676   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8677   ins_encode %{
8678     int vector_len = 2;
8679     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8680   %}
8681   ins_pipe( pipe_slow );
8682 %}
8683 
8684 // ----------------------- LogicalRightShift -----------------------------------
8685 
8686 // Shorts vector logical right shift produces an incorrect Java result
8687 // for negative data because Java code converts the short value into an int
8688 // with sign extension before the shift. But char vectors are fine since
8689 // chars are unsigned values.
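//
// For example, for a short s = -4 (0xFFFC), the scalar Java expression
// (short)(s >>> 1) promotes s to the int 0xFFFFFFFC, shifts it to 0x7FFFFFFE
// and narrows the result to 0xFFFE, whereas a packed 16-bit logical shift
// (psrlw) of 0xFFFC by 1 yields 0x7FFE. For a char the promotion zero-extends,
// so the scalar and packed results agree.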
8690 
8691 instruct vsrl2S(vecS dst, vecS shift) %{
8692   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8693   match(Set dst (URShiftVS dst shift));
8694   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8695   ins_encode %{
8696     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
8702   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8703   match(Set dst (URShiftVS dst shift));
8704   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8705   ins_encode %{
8706     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8707   %}
8708   ins_pipe( pipe_slow );
8709 %}
8710 
8711 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
8712   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8713   match(Set dst (URShiftVS src shift));
8714   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8715   ins_encode %{
8716     int vector_len = 0;
8717     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8718   %}
8719   ins_pipe( pipe_slow );
8720 %}
8721 
8722 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8723   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8724   match(Set dst (URShiftVS src shift));
8725   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8726   ins_encode %{
8727     int vector_len = 0;
8728     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8729   %}
8730   ins_pipe( pipe_slow );
8731 %}
8732 
8733 instruct vsrl4S(vecD dst, vecS shift) %{
8734   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8735   match(Set dst (URShiftVS dst shift));
8736   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8737   ins_encode %{
8738     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8739   %}
8740   ins_pipe( pipe_slow );
8741 %}
8742 
8743 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
8744   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8745   match(Set dst (URShiftVS dst shift));
8746   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8747   ins_encode %{
8748     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8749   %}
8750   ins_pipe( pipe_slow );
8751 %}
8752 
8753 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
8754   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8755   match(Set dst (URShiftVS src shift));
8756   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8757   ins_encode %{
8758     int vector_len = 0;
8759     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8760   %}
8761   ins_pipe( pipe_slow );
8762 %}
8763 
8764 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8765   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8766   match(Set dst (URShiftVS src shift));
8767   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8768   ins_encode %{
8769     int vector_len = 0;
8770     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8771   %}
8772   ins_pipe( pipe_slow );
8773 %}
8774 
8775 instruct vsrl8S(vecX dst, vecS shift) %{
8776   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8777   match(Set dst (URShiftVS dst shift));
8778   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8779   ins_encode %{
8780     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8781   %}
8782   ins_pipe( pipe_slow );
8783 %}
8784 
8785 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
8786   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8787   match(Set dst (URShiftVS dst shift));
8788   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8789   ins_encode %{
8790     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8791   %}
8792   ins_pipe( pipe_slow );
8793 %}
8794 
8795 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
8796   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8797   match(Set dst (URShiftVS src shift));
8798   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8799   ins_encode %{
8800     int vector_len = 0;
8801     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8802   %}
8803   ins_pipe( pipe_slow );
8804 %}
8805 
8806 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8807   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8808   match(Set dst (URShiftVS src shift));
8809   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8810   ins_encode %{
8811     int vector_len = 0;
8812     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8813   %}
8814   ins_pipe( pipe_slow );
8815 %}
8816 
8817 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
8818   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8819   match(Set dst (URShiftVS src shift));
8820   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8821   ins_encode %{
8822     int vector_len = 1;
8823     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8824   %}
8825   ins_pipe( pipe_slow );
8826 %}
8827 
8828 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8829   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8830   match(Set dst (URShiftVS src shift));
8831   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8832   ins_encode %{
8833     int vector_len = 1;
8834     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8835   %}
8836   ins_pipe( pipe_slow );
8837 %}
8838 
8839 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
8840   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8841   match(Set dst (URShiftVS src shift));
8842   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8843   ins_encode %{
8844     int vector_len = 2;
8845     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8846   %}
8847   ins_pipe( pipe_slow );
8848 %}
8849 
8850 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8851   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8852   match(Set dst (URShiftVS src shift));
8853   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8854   ins_encode %{
8855     int vector_len = 2;
8856     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8857   %}
8858   ins_pipe( pipe_slow );
8859 %}
8860 
8861 // Integers vector logical right shift
8862 instruct vsrl2I(vecD dst, vecS shift) %{
8863   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8864   match(Set dst (URShiftVI dst shift));
8865   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8866   ins_encode %{
8867     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8868   %}
8869   ins_pipe( pipe_slow );
8870 %}
8871 
8872 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
8873   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8874   match(Set dst (URShiftVI dst shift));
8875   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8876   ins_encode %{
8877     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8878   %}
8879   ins_pipe( pipe_slow );
8880 %}
8881 
8882 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
8883   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8884   match(Set dst (URShiftVI src shift));
8885   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8886   ins_encode %{
8887     int vector_len = 0;
8888     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8889   %}
8890   ins_pipe( pipe_slow );
8891 %}
8892 
8893 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8894   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8895   match(Set dst (URShiftVI src shift));
8896   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8897   ins_encode %{
8898     int vector_len = 0;
8899     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8900   %}
8901   ins_pipe( pipe_slow );
8902 %}
8903 
8904 instruct vsrl4I(vecX dst, vecS shift) %{
8905   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8906   match(Set dst (URShiftVI dst shift));
8907   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8908   ins_encode %{
8909     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8910   %}
8911   ins_pipe( pipe_slow );
8912 %}
8913 
8914 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
8915   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8916   match(Set dst (URShiftVI dst shift));
8917   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8918   ins_encode %{
8919     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8920   %}
8921   ins_pipe( pipe_slow );
8922 %}
8923 
8924 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
8925   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8926   match(Set dst (URShiftVI src shift));
8927   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8928   ins_encode %{
8929     int vector_len = 0;
8930     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8931   %}
8932   ins_pipe( pipe_slow );
8933 %}
8934 
8935 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8936   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8937   match(Set dst (URShiftVI src shift));
8938   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8939   ins_encode %{
8940     int vector_len = 0;
8941     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8942   %}
8943   ins_pipe( pipe_slow );
8944 %}
8945 
8946 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
8947   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8948   match(Set dst (URShiftVI src shift));
8949   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8950   ins_encode %{
8951     int vector_len = 1;
8952     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8953   %}
8954   ins_pipe( pipe_slow );
8955 %}
8956 
8957 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8958   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8959   match(Set dst (URShiftVI src shift));
8960   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8961   ins_encode %{
8962     int vector_len = 1;
8963     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8964   %}
8965   ins_pipe( pipe_slow );
8966 %}
8967 
8968 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
8969   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8970   match(Set dst (URShiftVI src shift));
8971   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8972   ins_encode %{
8973     int vector_len = 2;
8974     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8975   %}
8976   ins_pipe( pipe_slow );
8977 %}
8978 
8979 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8980   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8981   match(Set dst (URShiftVI src shift));
8982   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8983   ins_encode %{
8984     int vector_len = 2;
8985     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8986   %}
8987   ins_pipe( pipe_slow );
8988 %}
8989 
8990 // Longs vector logical right shift
8991 instruct vsrl2L(vecX dst, vecS shift) %{
8992   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8993   match(Set dst (URShiftVL dst shift));
8994   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
8995   ins_encode %{
8996     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8997   %}
8998   ins_pipe( pipe_slow );
8999 %}
9000 
9001 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9002   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9003   match(Set dst (URShiftVL dst shift));
9004   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9005   ins_encode %{
9006     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9007   %}
9008   ins_pipe( pipe_slow );
9009 %}
9010 
9011 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9012   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9013   match(Set dst (URShiftVL src shift));
9014   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9015   ins_encode %{
9016     int vector_len = 0;
9017     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9018   %}
9019   ins_pipe( pipe_slow );
9020 %}
9021 
9022 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9023   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9024   match(Set dst (URShiftVL src shift));
9025   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9026   ins_encode %{
9027     int vector_len = 0;
9028     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9029   %}
9030   ins_pipe( pipe_slow );
9031 %}
9032 
9033 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9034   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9035   match(Set dst (URShiftVL src shift));
9036   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9037   ins_encode %{
9038     int vector_len = 1;
9039     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9040   %}
9041   ins_pipe( pipe_slow );
9042 %}
9043 
9044 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9045   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9046   match(Set dst (URShiftVL src shift));
9047   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9048   ins_encode %{
9049     int vector_len = 1;
9050     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9051   %}
9052   ins_pipe( pipe_slow );
9053 %}
9054 
9055 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9056   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9057   match(Set dst (URShiftVL src shift));
9058   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9059   ins_encode %{
9060     int vector_len = 2;
9061     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9062   %}
9063   ins_pipe( pipe_slow );
9064 %}
9065 
9066 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9067   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9068   match(Set dst (URShiftVL src shift));
9069   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9070   ins_encode %{
9071     int vector_len = 2;
9072     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9073   %}
9074   ins_pipe( pipe_slow );
9075 %}
9076 
9077 // ------------------- ArithmeticRightShift -----------------------------------
9078 
9079 // Shorts/Chars vector arithmetic right shift
9080 instruct vsra2S(vecS dst, vecS shift) %{
9081   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9082   match(Set dst (RShiftVS dst shift));
9083   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9084   ins_encode %{
9085     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9086   %}
9087   ins_pipe( pipe_slow );
9088 %}
9089 
9090 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9091   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9092   match(Set dst (RShiftVS dst shift));
9093   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9094   ins_encode %{
9095     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9096   %}
9097   ins_pipe( pipe_slow );
9098 %}
9099 
9100 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
9101   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9102   match(Set dst (RShiftVS src shift));
9103   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9104   ins_encode %{
9105     int vector_len = 0;
9106     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9107   %}
9108   ins_pipe( pipe_slow );
9109 %}
9110 
9111 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
9112   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9113   match(Set dst (RShiftVS src shift));
9114   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9115   ins_encode %{
9116     int vector_len = 0;
9117     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9118   %}
9119   ins_pipe( pipe_slow );
9120 %}
9121 
9122 instruct vsra4S(vecD dst, vecS shift) %{
9123   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9124   match(Set dst (RShiftVS dst shift));
9125   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9126   ins_encode %{
9127     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9128   %}
9129   ins_pipe( pipe_slow );
9130 %}
9131 
9132 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9133   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9134   match(Set dst (RShiftVS dst shift));
9135   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9136   ins_encode %{
9137     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9138   %}
9139   ins_pipe( pipe_slow );
9140 %}
9141 
9142 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
9143   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9144   match(Set dst (RShiftVS src shift));
9145   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9146   ins_encode %{
9147     int vector_len = 0;
9148     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9149   %}
9150   ins_pipe( pipe_slow );
9151 %}
9152 
9153 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
9154   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9155   match(Set dst (RShiftVS src shift));
9156   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9157   ins_encode %{
9158     int vector_len = 0;
9159     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9160   %}
9161   ins_pipe( pipe_slow );
9162 %}
9163 
9164 instruct vsra8S(vecX dst, vecS shift) %{
9165   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9166   match(Set dst (RShiftVS dst shift));
9167   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9168   ins_encode %{
9169     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9170   %}
9171   ins_pipe( pipe_slow );
9172 %}
9173 
9174 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9175   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9176   match(Set dst (RShiftVS dst shift));
9177   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9178   ins_encode %{
9179     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9180   %}
9181   ins_pipe( pipe_slow );
9182 %}
9183 
9184 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
9185   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9186   match(Set dst (RShiftVS src shift));
9187   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9188   ins_encode %{
9189     int vector_len = 0;
9190     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9191   %}
9192   ins_pipe( pipe_slow );
9193 %}
9194 
9195 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
9196   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9197   match(Set dst (RShiftVS src shift));
9198   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9199   ins_encode %{
9200     int vector_len = 0;
9201     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9202   %}
9203   ins_pipe( pipe_slow );
9204 %}
9205 
9206 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
9207   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9208   match(Set dst (RShiftVS src shift));
9209   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9210   ins_encode %{
9211     int vector_len = 1;
9212     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9213   %}
9214   ins_pipe( pipe_slow );
9215 %}
9216 
9217 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
9218   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9219   match(Set dst (RShiftVS src shift));
9220   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9221   ins_encode %{
9222     int vector_len = 1;
9223     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9224   %}
9225   ins_pipe( pipe_slow );
9226 %}
9227 
9228 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
9229   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9230   match(Set dst (RShiftVS src shift));
9231   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9232   ins_encode %{
9233     int vector_len = 2;
9234     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9235   %}
9236   ins_pipe( pipe_slow );
9237 %}
9238 
9239 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9240   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9241   match(Set dst (RShiftVS src shift));
9242   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9243   ins_encode %{
9244     int vector_len = 2;
9245     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9246   %}
9247   ins_pipe( pipe_slow );
9248 %}
9249 
9250 // Integers vector arithmetic right shift
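// An arithmetic right shift copies the sign bit into the vacated positions: shifting the
// packed dwords [0x80000000, 0x00000004] right by 2 yields [0xE0000000, 0x00000001].
// The vector_len argument used throughout these rules selects the encoded width:
// 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.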
9251 instruct vsra2I(vecD dst, vecS shift) %{
9252   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9253   match(Set dst (RShiftVI dst shift));
9254   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9255   ins_encode %{
9256     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9257   %}
9258   ins_pipe( pipe_slow );
9259 %}
9260 
9261 instruct vsra2I_imm(vecD dst, immI8 shift) %{
9262   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9263   match(Set dst (RShiftVI dst shift));
9264   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9265   ins_encode %{
9266     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9267   %}
9268   ins_pipe( pipe_slow );
9269 %}
9270 
9271 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
9272   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9273   match(Set dst (RShiftVI src shift));
9274   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9275   ins_encode %{
9276     int vector_len = 0;
9277     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9278   %}
9279   ins_pipe( pipe_slow );
9280 %}
9281 
9282 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9283   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9284   match(Set dst (RShiftVI src shift));
9285   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9286   ins_encode %{
9287     int vector_len = 0;
9288     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9289   %}
9290   ins_pipe( pipe_slow );
9291 %}
9292 
9293 instruct vsra4I(vecX dst, vecS shift) %{
9294   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9295   match(Set dst (RShiftVI dst shift));
9296   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9297   ins_encode %{
9298     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9299   %}
9300   ins_pipe( pipe_slow );
9301 %}
9302 
9303 instruct vsra4I_imm(vecX dst, immI8 shift) %{
9304   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9305   match(Set dst (RShiftVI dst shift));
9306   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9307   ins_encode %{
9308     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9309   %}
9310   ins_pipe( pipe_slow );
9311 %}
9312 
9313 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
9314   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9315   match(Set dst (RShiftVI src shift));
9316   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9317   ins_encode %{
9318     int vector_len = 0;
9319     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9320   %}
9321   ins_pipe( pipe_slow );
9322 %}
9323 
9324 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9325   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9326   match(Set dst (RShiftVI src shift));
9327   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9328   ins_encode %{
9329     int vector_len = 0;
9330     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9331   %}
9332   ins_pipe( pipe_slow );
9333 %}
9334 
9335 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
9336   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9337   match(Set dst (RShiftVI src shift));
9338   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9339   ins_encode %{
9340     int vector_len = 1;
9341     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9342   %}
9343   ins_pipe( pipe_slow );
9344 %}
9345 
9346 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9347   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9348   match(Set dst (RShiftVI src shift));
9349   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9350   ins_encode %{
9351     int vector_len = 1;
9352     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9353   %}
9354   ins_pipe( pipe_slow );
9355 %}
9356 
9357 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
9358   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9359   match(Set dst (RShiftVI src shift));
9360   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9361   ins_encode %{
9362     int vector_len = 2;
9363     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9364   %}
9365   ins_pipe( pipe_slow );
9366 %}
9367 
9368 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9369   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9370   match(Set dst (RShiftVI src shift));
9371   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9372   ins_encode %{
9373     int vector_len = 2;
9374     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9375   %}
9376   ins_pipe( pipe_slow );
9377 %}
9378 
9379 // There are no vector arithmetic right shift instructions for longs.
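// (A packed 64-bit arithmetic right shift, vpsraq, exists only as an EVEX/AVX-512
// encoding, so RShiftVL is not matched in this file.)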
9380 
9381 
9382 // --------------------------------- AND --------------------------------------
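// Bitwise logic is element-size agnostic, so the AND/OR/XOR rules are selected purely
// by vector width (length_in_bytes) rather than by element count.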
9383 
9384 instruct vand4B(vecS dst, vecS src) %{
9385   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9386   match(Set dst (AndV dst src));
9387   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
9388   ins_encode %{
9389     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9390   %}
9391   ins_pipe( pipe_slow );
9392 %}
9393 
9394 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
9395   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9396   match(Set dst (AndV src1 src2));
9397   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
9398   ins_encode %{
9399     int vector_len = 0;
9400     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9401   %}
9402   ins_pipe( pipe_slow );
9403 %}
9404 
9405 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
9406   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9407   match(Set dst (AndV src (LoadVector mem)));
9408   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
9409   ins_encode %{
9410     int vector_len = 0;
9411     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9412   %}
9413   ins_pipe( pipe_slow );
9414 %}
9415 
9416 instruct vand8B(vecD dst, vecD src) %{
9417   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9418   match(Set dst (AndV dst src));
9419   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
9420   ins_encode %{
9421     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9422   %}
9423   ins_pipe( pipe_slow );
9424 %}
9425 
9426 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
9427   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9428   match(Set dst (AndV src1 src2));
9429   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
9430   ins_encode %{
9431     int vector_len = 0;
9432     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9433   %}
9434   ins_pipe( pipe_slow );
9435 %}
9436 
9437 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
9438   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9439   match(Set dst (AndV src (LoadVector mem)));
9440   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
9441   ins_encode %{
9442     int vector_len = 0;
9443     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9444   %}
9445   ins_pipe( pipe_slow );
9446 %}
9447 
9448 instruct vand16B(vecX dst, vecX src) %{
9449   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9450   match(Set dst (AndV dst src));
9451   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
9452   ins_encode %{
9453     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9454   %}
9455   ins_pipe( pipe_slow );
9456 %}
9457 
9458 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
9459   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9460   match(Set dst (AndV src1 src2));
9461   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
9462   ins_encode %{
9463     int vector_len = 0;
9464     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9465   %}
9466   ins_pipe( pipe_slow );
9467 %}
9468 
9469 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
9470   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9471   match(Set dst (AndV src (LoadVector mem)));
9472   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
9473   ins_encode %{
9474     int vector_len = 0;
9475     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9476   %}
9477   ins_pipe( pipe_slow );
9478 %}
9479 
9480 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
9481   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9482   match(Set dst (AndV src1 src2));
9483   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
9484   ins_encode %{
9485     int vector_len = 1;
9486     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9487   %}
9488   ins_pipe( pipe_slow );
9489 %}
9490 
9491 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
9492   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9493   match(Set dst (AndV src (LoadVector mem)));
9494   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
9495   ins_encode %{
9496     int vector_len = 1;
9497     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9498   %}
9499   ins_pipe( pipe_slow );
9500 %}
9501 
9502 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9503   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9504   match(Set dst (AndV src1 src2));
9505   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
9506   ins_encode %{
9507     int vector_len = 2;
9508     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9509   %}
9510   ins_pipe( pipe_slow );
9511 %}
9512 
9513 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
9514   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9515   match(Set dst (AndV src (LoadVector mem)));
9516   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
9517   ins_encode %{
9518     int vector_len = 2;
9519     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9520   %}
9521   ins_pipe( pipe_slow );
9522 %}
9523 
9524 // --------------------------------- OR ---------------------------------------
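// The OR rules mirror the AND rules above: por for the pre-AVX in-place forms,
// vpor for the three-operand AVX forms.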
9525 
9526 instruct vor4B(vecS dst, vecS src) %{
9527   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9528   match(Set dst (OrV dst src));
9529   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
9530   ins_encode %{
9531     __ por($dst$$XMMRegister, $src$$XMMRegister);
9532   %}
9533   ins_pipe( pipe_slow );
9534 %}
9535 
9536 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
9537   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9538   match(Set dst (OrV src1 src2));
9539   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
9540   ins_encode %{
9541     int vector_len = 0;
9542     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9543   %}
9544   ins_pipe( pipe_slow );
9545 %}
9546 
9547 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
9548   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9549   match(Set dst (OrV src (LoadVector mem)));
9550   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
9551   ins_encode %{
9552     int vector_len = 0;
9553     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9554   %}
9555   ins_pipe( pipe_slow );
9556 %}
9557 
9558 instruct vor8B(vecD dst, vecD src) %{
9559   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9560   match(Set dst (OrV dst src));
9561   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
9562   ins_encode %{
9563     __ por($dst$$XMMRegister, $src$$XMMRegister);
9564   %}
9565   ins_pipe( pipe_slow );
9566 %}
9567 
9568 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9569   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9570   match(Set dst (OrV src1 src2));
9571   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9572   ins_encode %{
9573     int vector_len = 0;
9574     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9575   %}
9576   ins_pipe( pipe_slow );
9577 %}
9578 
9579 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9580   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9581   match(Set dst (OrV src (LoadVector mem)));
9582   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9583   ins_encode %{
9584     int vector_len = 0;
9585     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9586   %}
9587   ins_pipe( pipe_slow );
9588 %}
9589 
9590 instruct vor16B(vecX dst, vecX src) %{
9591   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9592   match(Set dst (OrV dst src));
9593   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9594   ins_encode %{
9595     __ por($dst$$XMMRegister, $src$$XMMRegister);
9596   %}
9597   ins_pipe( pipe_slow );
9598 %}
9599 
9600 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9601   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9602   match(Set dst (OrV src1 src2));
9603   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9604   ins_encode %{
9605     int vector_len = 0;
9606     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9607   %}
9608   ins_pipe( pipe_slow );
9609 %}
9610 
9611 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9612   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9613   match(Set dst (OrV src (LoadVector mem)));
9614   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9615   ins_encode %{
9616     int vector_len = 0;
9617     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9618   %}
9619   ins_pipe( pipe_slow );
9620 %}
9621 
9622 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9623   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9624   match(Set dst (OrV src1 src2));
9625   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9626   ins_encode %{
9627     int vector_len = 1;
9628     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9629   %}
9630   ins_pipe( pipe_slow );
9631 %}
9632 
9633 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9634   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9635   match(Set dst (OrV src (LoadVector mem)));
9636   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9637   ins_encode %{
9638     int vector_len = 1;
9639     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9640   %}
9641   ins_pipe( pipe_slow );
9642 %}
9643 
9644 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9645   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9646   match(Set dst (OrV src1 src2));
9647   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9648   ins_encode %{
9649     int vector_len = 2;
9650     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9651   %}
9652   ins_pipe( pipe_slow );
9653 %}
9654 
9655 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9656   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9657   match(Set dst (OrV src (LoadVector mem)));
9658   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9659   ins_encode %{
9660     int vector_len = 2;
9661     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9662   %}
9663   ins_pipe( pipe_slow );
9664 %}
9665 
9666 // --------------------------------- XOR --------------------------------------
9667 
9668 instruct vxor4B(vecS dst, vecS src) %{
9669   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9670   match(Set dst (XorV dst src));
9671   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9672   ins_encode %{
9673     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9674   %}
9675   ins_pipe( pipe_slow );
9676 %}
9677 
9678 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9679   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9680   match(Set dst (XorV src1 src2));
9681   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9682   ins_encode %{
9683     int vector_len = 0;
9684     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9685   %}
9686   ins_pipe( pipe_slow );
9687 %}
9688 
9689 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9690   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9691   match(Set dst (XorV src (LoadVector mem)));
9692   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9693   ins_encode %{
9694     int vector_len = 0;
9695     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9696   %}
9697   ins_pipe( pipe_slow );
9698 %}
9699 
9700 instruct vxor8B(vecD dst, vecD src) %{
9701   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9702   match(Set dst (XorV dst src));
9703   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9704   ins_encode %{
9705     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9706   %}
9707   ins_pipe( pipe_slow );
9708 %}
9709 
9710 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9711   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9712   match(Set dst (XorV src1 src2));
9713   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9714   ins_encode %{
9715     int vector_len = 0;
9716     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9717   %}
9718   ins_pipe( pipe_slow );
9719 %}
9720 
9721 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9722   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9723   match(Set dst (XorV src (LoadVector mem)));
9724   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9725   ins_encode %{
9726     int vector_len = 0;
9727     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9728   %}
9729   ins_pipe( pipe_slow );
9730 %}
9731 
9732 instruct vxor16B(vecX dst, vecX src) %{
9733   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9734   match(Set dst (XorV dst src));
9735   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9736   ins_encode %{
9737     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9738   %}
9739   ins_pipe( pipe_slow );
9740 %}
9741 
9742 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9743   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9744   match(Set dst (XorV src1 src2));
9745   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9746   ins_encode %{
9747     int vector_len = 0;
9748     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9749   %}
9750   ins_pipe( pipe_slow );
9751 %}
9752 
9753 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9754   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9755   match(Set dst (XorV src (LoadVector mem)));
9756   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9757   ins_encode %{
9758     int vector_len = 0;
9759     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9760   %}
9761   ins_pipe( pipe_slow );
9762 %}
9763 
9764 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9765   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9766   match(Set dst (XorV src1 src2));
9767   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9768   ins_encode %{
9769     int vector_len = 1;
9770     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9771   %}
9772   ins_pipe( pipe_slow );
9773 %}
9774 
9775 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9776   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9777   match(Set dst (XorV src (LoadVector mem)));
9778   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9779   ins_encode %{
9780     int vector_len = 1;
9781     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9782   %}
9783   ins_pipe( pipe_slow );
9784 %}
9785 
9786 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9787   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9788   match(Set dst (XorV src1 src2));
9789   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9790   ins_encode %{
9791     int vector_len = 2;
9792     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9793   %}
9794   ins_pipe( pipe_slow );
9795 %}
9796 
9797 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9798   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9799   match(Set dst (XorV src (LoadVector mem)));
9800   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9801   ins_encode %{
9802     int vector_len = 2;
9803     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9804   %}
9805   ins_pipe( pipe_slow );
9806 %}
9807 
9808 // --------------------------------- FMA --------------------------------------
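// Each rule keeps the accumulator in $c (match(Set c (FmaVD c (Binary a b)))), so the
// result overwrites the addend, presumably mapping onto the fused vfmadd231pd/ps forms
// via MacroAssembler::vfmad/vfmaf.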
9809 
9810 // a * b + c
9811 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9812   predicate(UseFMA && n->as_Vector()->length() == 2);
9813   match(Set c (FmaVD  c (Binary a b)));
9814   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9815   ins_cost(150);
9816   ins_encode %{
9817     int vector_len = 0;
9818     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9819   %}
9820   ins_pipe( pipe_slow );
9821 %}
9822 
9823 // a * b + c
9824 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9825   predicate(UseFMA && n->as_Vector()->length() == 2);
9826   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9827   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9828   ins_cost(150);
9829   ins_encode %{
9830     int vector_len = 0;
9831     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9832   %}
9833   ins_pipe( pipe_slow );
9834 %}
9835 
9836 
9837 // a * b + c
9838 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9839   predicate(UseFMA && n->as_Vector()->length() == 4);
9840   match(Set c (FmaVD  c (Binary a b)));
9841   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9842   ins_cost(150);
9843   ins_encode %{
9844     int vector_len = 1;
9845     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9846   %}
9847   ins_pipe( pipe_slow );
9848 %}
9849 
9850 // a * b + c
9851 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9852   predicate(UseFMA && n->as_Vector()->length() == 4);
9853   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9854   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9855   ins_cost(150);
9856   ins_encode %{
9857     int vector_len = 1;
9858     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9859   %}
9860   ins_pipe( pipe_slow );
9861 %}
9862 
9863 // a * b + c
9864 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9865   predicate(UseFMA && n->as_Vector()->length() == 8);
9866   match(Set c (FmaVD  c (Binary a b)));
9867   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9868   ins_cost(150);
9869   ins_encode %{
9870     int vector_len = 2;
9871     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9872   %}
9873   ins_pipe( pipe_slow );
9874 %}
9875 
9876 // a * b + c
9877 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9878   predicate(UseFMA && n->as_Vector()->length() == 8);
9879   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9880   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9881   ins_cost(150);
9882   ins_encode %{
9883     int vector_len = 2;
9884     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9885   %}
9886   ins_pipe( pipe_slow );
9887 %}
9888 
9889 // a * b + c
9890 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9891   predicate(UseFMA && n->as_Vector()->length() == 4);
9892   match(Set c (FmaVF  c (Binary a b)));
9893   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9894   ins_cost(150);
9895   ins_encode %{
9896     int vector_len = 0;
9897     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9898   %}
9899   ins_pipe( pipe_slow );
9900 %}
9901 
9902 // a * b + c
9903 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9904   predicate(UseFMA && n->as_Vector()->length() == 4);
9905   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9906   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9907   ins_cost(150);
9908   ins_encode %{
9909     int vector_len = 0;
9910     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9911   %}
9912   ins_pipe( pipe_slow );
9913 %}
9914 
9915 // a * b + c
9916 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9917   predicate(UseFMA && n->as_Vector()->length() == 8);
9918   match(Set c (FmaVF  c (Binary a b)));
9919   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9920   ins_cost(150);
9921   ins_encode %{
9922     int vector_len = 1;
9923     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9924   %}
9925   ins_pipe( pipe_slow );
9926 %}
9927 
9928 // a * b + c
9929 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9930   predicate(UseFMA && n->as_Vector()->length() == 8);
9931   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9932   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9933   ins_cost(150);
9934   ins_encode %{
9935     int vector_len = 1;
9936     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9937   %}
9938   ins_pipe( pipe_slow );
9939 %}
9940 
9941 // a * b + c
9942 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9943   predicate(UseFMA && n->as_Vector()->length() == 16);
9944   match(Set c (FmaVF  c (Binary a b)));
9945   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9946   ins_cost(150);
9947   ins_encode %{
9948     int vector_len = 2;
9949     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9950   %}
9951   ins_pipe( pipe_slow );
9952 %}
9953 
9954 // a * b + c
9955 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9956   predicate(UseFMA && n->as_Vector()->length() == 16);
9957   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9958   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9959   ins_cost(150);
9960   ins_encode %{
9961     int vector_len = 2;
9962     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9963   %}
9964   ins_pipe( pipe_slow );
9965 %}
9966 
9967 // --------------------------------- Vector Multiply Add --------------------------------------
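// pmaddwd/vpmaddwd multiply packed signed 16-bit elements and add each adjacent pair of
// 32-bit products, so a packed4S input produces a packed2I result (hence the rule names).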
9968 
9969 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9970   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9971   match(Set dst (MulAddVS2VI dst src1));
9972   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9973   ins_encode %{
9974     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9975   %}
9976   ins_pipe( pipe_slow );
9977 %}
9978 
9979 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9980   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9981   match(Set dst (MulAddVS2VI src1 src2));
9982   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9983   ins_encode %{
9984     int vector_len = 0;
9985     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9986   %}
9987   ins_pipe( pipe_slow );
9988 %}
9989 
9990 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
9991   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
9992   match(Set dst (MulAddVS2VI dst src1));
9993   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
9994   ins_encode %{
9995     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9996   %}
9997   ins_pipe( pipe_slow );
9998 %}
9999 
10000 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
10001   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10002   match(Set dst (MulAddVS2VI src1 src2));
10003   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
10004   ins_encode %{
10005     int vector_len = 0;
10006     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10007   %}
10008   ins_pipe( pipe_slow );
10009 %}
10010 
10011 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
10012   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10013   match(Set dst (MulAddVS2VI src1 src2));
10014   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
10015   ins_encode %{
10016     int vector_len = 1;
10017     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10018   %}
10019   ins_pipe( pipe_slow );
10020 %}
10021 
10022 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
10023   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10024   match(Set dst (MulAddVS2VI src1 src2));
10025   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
10026   ins_encode %{
10027     int vector_len = 2;
10028     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10029   %}
10030   ins_pipe( pipe_slow );
10031 %}
10032 
10033 // --------------------------------- Vector Multiply Add Add ----------------------------------
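// evpdpwssd (AVX-512 VNNI) fuses the multiply-add above with the accumulating add,
// effectively dst += pmaddwd(src1, src2); the rules therefore require supports_vnni()
// and carry a low ins_cost so the matcher can prefer them over the two-instruction sequence.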
10034 
10035 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
10036   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
10037   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10038   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
10039   ins_encode %{
10040     int vector_len = 0;
10041     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10042   %}
10043   ins_pipe( pipe_slow );
10044   ins_cost(10);
10045 %}
10046 
10047 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
10048   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
10049   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10050   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
10051   ins_encode %{
10052     int vector_len = 0;
10053     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10054   %}
10055   ins_pipe( pipe_slow );
10056   ins_cost(10);
10057 %}
10058 
10059 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
10060   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
10061   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10062   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
10063   ins_encode %{
10064     int vector_len = 1;
10065     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10066   %}
10067   ins_pipe( pipe_slow );
10068   ins_cost(10);
10069 %}
10070 
10071 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
10072   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
10073   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10074   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
10075   ins_encode %{
10076     int vector_len = 2;
10077     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10078   %}
10079   ins_pipe( pipe_slow );
10080   ins_cost(10);
10081 %}
10082 
10083 // --------------------------------- PopCount --------------------------------------
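// vpopcntd counts the set bits in each 32-bit lane and is only available with the
// AVX-512 VPOPCNTDQ extension, checked via VM_Version::supports_vpopcntdq().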
10084 
10085 instruct vpopcount2I(vecD dst, vecD src) %{
10086   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
10087   match(Set dst (PopCountVI src));
10088   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
10089   ins_encode %{
10090     int vector_len = 0;
10091     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10092   %}
10093   ins_pipe( pipe_slow );
10094 %}
10095 
10096 instruct vpopcount4I(vecX dst, vecX src) %{
10097   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
10098   match(Set dst (PopCountVI src));
10099   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
10100   ins_encode %{
10101     int vector_len = 0;
10102     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10103   %}
10104   ins_pipe( pipe_slow );
10105 %}
10106 
10107 instruct vpopcount8I(vecY dst, vecY src) %{
10108   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
10109   match(Set dst (PopCountVI src));
10110   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
10111   ins_encode %{
10112     int vector_len = 1;
10113     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10114   %}
10115   ins_pipe( pipe_slow );
10116 %}
10117 
10118 instruct vpopcount16I(vecZ dst, vecZ src) %{
10119   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
10120   match(Set dst (PopCountVI src));
10121   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
10122   ins_encode %{
10123     int vector_len = 2;
10124     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10125   %}
10126   ins_pipe( pipe_slow );
10127 %}