//
// Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
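//
// As an illustrative reading of the entries below (not an additional
// definition): reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1))
// declares the second 32-bit word of xmm0 as a Save-On-Call slot in both the
// allocator's view and the C calling convention's view, spilled as a float
// (Op_RegF), with hardware encoding 0, and bound to the VMReg slot that
// follows xmm0's first slot.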

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words a and b hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see the UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register is preserved across function calls;
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM31 preserved across function calls
//              XMM0-XMM3 might hold parameters
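//
// For example (illustration only): a Double kept in xmm0 occupies the two
// slots XMM0 and XMM0b defined below, while a full 512-bit vector in zmm0
// spans all sixteen slots XMM0 through XMM0p.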

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

// flags allocation class should be last.
alloc_class chunk2(RFLAGS);

// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
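// Note (explanatory, not an additional class): a reg_class_dynamic resolves to
// its first class when the guard expression holds for the CPU features detected
// at startup and to the second class otherwise, so float_reg covers XMM0-XMM31
// on EVEX-capable hardware and only XMM0-XMM15 (XMM0-XMM7 on 32-bit) elsewhere.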

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 128-bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for EVEX 128-bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
1276     // exception handler starts out as jump and can be patched to
1277     // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
1285     // three 5 byte instructions plus one move for unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
1291     // exception handler starts out as jump and can be patched to
1292     // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375 
1376 
1377 const bool Matcher::match_rule_supported(int opcode) {
1378   if (!has_match_rule(opcode))
1379     return false;
1380 
1381   bool ret_value = true;
1382   switch (opcode) {
1383     case Op_PopCountI:
1384     case Op_PopCountL:
1385       if (!UsePopCountInstruction)
1386         ret_value = false;
1387       break;
1388     case Op_PopCountVI:
1389       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1390         ret_value = false;
1391       break;
1392     case Op_MulVI:
1393       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1394         ret_value = false;
1395       break;
1396     case Op_MulVL:
1397     case Op_MulReductionVL:
1398       if (VM_Version::supports_avx512dq() == false)
1399         ret_value = false;
1400       break;
1401     case Op_AddReductionVL:
1402       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1403         ret_value = false;
1404       break;
1405     case Op_AddReductionVI:
1406       if (UseSSE < 3) // requires at least SSE3
1407         ret_value = false;
1408       break;
1409     case Op_MulReductionVI:
1410       if (UseSSE < 4) // requires at least SSE4
1411         ret_value = false;
1412       break;
1413     case Op_AddReductionVF:
1414     case Op_AddReductionVD:
1415     case Op_MulReductionVF:
1416     case Op_MulReductionVD:
1417       if (UseSSE < 1) // requires at least SSE
1418         ret_value = false;
1419       break;
1420     case Op_SqrtVD:
1421     case Op_SqrtVF:
1422       if (UseAVX < 1) // enabled for AVX only
1423         ret_value = false;
1424       break;
1425     case Op_CompareAndSwapL:
1426 #ifdef _LP64
1427     case Op_CompareAndSwapP:
1428 #endif
1429       if (!VM_Version::supports_cx8())
1430         ret_value = false;
1431       break;
1432     case Op_CMoveVF:
1433     case Op_CMoveVD:
1434       if (UseAVX < 1 || UseAVX > 2)
1435         ret_value = false;
1436       break;
1437     case Op_StrIndexOf:
1438       if (!UseSSE42Intrinsics)
1439         ret_value = false;
1440       break;
1441     case Op_StrIndexOfChar:
1442       if (!UseSSE42Intrinsics)
1443         ret_value = false;
1444       break;
1445     case Op_OnSpinWait:
1446       if (VM_Version::supports_on_spin_wait() == false)
1447         ret_value = false;
1448       break;
1449     case Op_MulAddVS2VI:
1450       if (UseSSE < 2)
1451         ret_value = false;
1452       break;
1453     case Op_MaxD:
1454     case Op_MaxF:
1455     case Op_MinD:
1456     case Op_MinF:
1457       if (UseAVX < 1) // enabled for AVX only
1458         ret_value = false;
1459       break;
1460   }
1461 
1462   return ret_value;  // By default match rules are supported.
1463 }
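
     // For example (illustrative only): on a CPU without AVX-512DQ support,
     // Op_MulVL and Op_MulReductionVL are reported as unsupported above, so
     // C2 will not emit those vector nodes on that machine.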
1464 
1465 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1466   // Identify extra cases that we might want to provide match rules for,
1467   // e.g. Op_ vector nodes and other intrinsics, while guarding with vlen.
1468   bool ret_value = match_rule_supported(opcode);
1469   if (ret_value) {
1470     switch (opcode) {
1471       case Op_AddVB:
1472       case Op_SubVB:
1473         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1474           ret_value = false;
1475         break;
1476       case Op_URShiftVS:
1477       case Op_RShiftVS:
1478       case Op_LShiftVS:
1479       case Op_MulVS:
1480       case Op_AddVS:
1481       case Op_SubVS:
1482         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1483           ret_value = false;
1484         break;
1485       case Op_CMoveVF:
1486         if (vlen != 8)
1487           ret_value  = false;
1488         break;
1489       case Op_CMoveVD:
1490         if (vlen != 4)
1491           ret_value  = false;
1492         break;
1493     }
1494   }
1495 
1496   return ret_value;  // By default match rules are supported.
1497 }
1498 
1499 const bool Matcher::has_predicated_vectors(void) {
1500   bool ret_value = false;
1501   if (UseAVX > 2) {
1502     ret_value = VM_Version::supports_avx512vl();
1503   }
1504 
1505   return ret_value;
1506 }
1507 
1508 const int Matcher::float_pressure(int default_pressure_threshold) {
1509   int float_pressure_threshold = default_pressure_threshold;
1510 #ifdef _LP64
1511   if (UseAVX > 2) {
1512     // Increase pressure threshold on machines with AVX3 which have
1513     // 2x more XMM registers.
1514     float_pressure_threshold = default_pressure_threshold * 2;
1515   }
1516 #endif
1517   return float_pressure_threshold;
1518 }
1519 
1520 // Max vector size in bytes. 0 if not supported.
1521 const int Matcher::vector_width_in_bytes(BasicType bt) {
1522   assert(is_java_primitive(bt), "only primitive type vectors");
1523   if (UseSSE < 2) return 0;
1524   // SSE2 supports 128bit vectors for all types.
1525   // AVX2 supports 256bit vectors for all types.
1526   // AVX512/EVEX supports 512bit vectors for all types.
1527   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1528   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1529   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1530     size = (UseAVX > 2) ? 64 : 32;
1531   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1532     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1533   // Use flag to limit vector size.
1534   size = MIN2(size,(int)MaxVectorSize);
1535   // Minimum 2 values in vector (or 4 for bytes).
1536   switch (bt) {
1537   case T_DOUBLE:
1538   case T_LONG:
1539     if (size < 16) return 0;
1540     break;
1541   case T_FLOAT:
1542   case T_INT:
1543     if (size < 8) return 0;
1544     break;
1545   case T_BOOLEAN:
1546     if (size < 4) return 0;
1547     break;
1548   case T_CHAR:
1549     if (size < 4) return 0;
1550     break;
1551   case T_BYTE:
1552     if (size < 4) return 0;
1553     break;
1554   case T_SHORT:
1555     if (size < 4) return 0;
1556     break;
1557   default:
1558     ShouldNotReachHere();
1559   }
1560   return size;
1561 }
1562 
1563 // Limits on vector size (number of elements) loaded into vector.
1564 const int Matcher::max_vector_size(const BasicType bt) {
1565   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1566 }
1567 const int Matcher::min_vector_size(const BasicType bt) {
1568   int max_size = max_vector_size(bt);
1569   // Min size which can be loaded into vector is 4 bytes.
1570   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1571   return MIN2(size,max_size);
1572 }
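
     // Worked example (illustrative): with UseAVX == 2 and MaxVectorSize == 32,
     // vector_width_in_bytes(T_INT) is (1 << 2) * 8 = 32 bytes, so
     // max_vector_size(T_INT) = 32 / 4 = 8 elements and
     // min_vector_size(T_INT) = MIN2(2, 8) = 2 elements.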
1573 
1574 // Vector ideal reg corresponding to specified size in bytes
1575 const uint Matcher::vector_ideal_reg(int size) {
1576   assert(MaxVectorSize >= size, "");
1577   switch(size) {
1578     case  4: return Op_VecS;
1579     case  8: return Op_VecD;
1580     case 16: return Op_VecX;
1581     case 32: return Op_VecY;
1582     case 64: return Op_VecZ;
1583   }
1584   ShouldNotReachHere();
1585   return 0;
1586 }
1587 
1588 // Only lowest bits of xmm reg are used for vector shift count.
1589 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1590   return Op_VecS;
1591 }
1592 
1593 // x86 supports misaligned vector store/load.
1594 const bool Matcher::misaligned_vectors_ok() {
1595   return !AlignVector; // can be changed by flag
1596 }
1597 
1598 // x86 AES instructions are compatible with SunJCE expanded
1599 // keys, hence we do not need to pass the original key to stubs
1600 const bool Matcher::pass_original_key_for_aes() {
1601   return false;
1602 }
1603 
1604 
1605 const bool Matcher::convi2l_type_required = true;
1606 
1607 // Check for shift by small constant as well
1608 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1609   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1610       shift->in(2)->get_int() <= 3 &&
1611       // Are there other uses besides address expressions?
1612       !matcher->is_visited(shift)) {
1613     address_visited.set(shift->_idx); // Flag as address_visited
1614     mstack.push(shift->in(2), Matcher::Visit);
1615     Node *conv = shift->in(1);
1616 #ifdef _LP64
1617     // Allow Matcher to match the rule which bypasses the
1618     // ConvI2L operation for an array index on LP64
1619     // if the index value is positive.
1620     if (conv->Opcode() == Op_ConvI2L &&
1621         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1622         // Are there other uses besides address expressions?
1623         !matcher->is_visited(conv)) {
1624       address_visited.set(conv->_idx); // Flag as address_visited
1625       mstack.push(conv->in(1), Matcher::Pre_Visit);
1626     } else
1627 #endif
1628       mstack.push(conv, Matcher::Pre_Visit);
1629     return true;
1630   }
1631   return false;
1632 }
1633 
1634 // Should the Matcher clone shifts on addressing modes, expecting them
1635 // to be subsumed into complex addressing expressions or compute them
1636 // into registers?
1637 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1638   Node *off = m->in(AddPNode::Offset);
1639   if (off->is_Con()) {
1640     address_visited.test_set(m->_idx); // Flag as address_visited
1641     Node *adr = m->in(AddPNode::Address);
1642 
1643     // Intel can handle 2 adds in addressing mode
1644     // AtomicAdd is not an addressing expression.
1645     // Cheap to find it by looking for screwy base.
1646     if (adr->is_AddP() &&
1647         !adr->in(AddPNode::Base)->is_top() &&
1648         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1649         // Are there other uses besides address expressions?
1650         !is_visited(adr)) {
1651       address_visited.set(adr->_idx); // Flag as address_visited
1652       Node *shift = adr->in(AddPNode::Offset);
1653       if (!clone_shift(shift, this, mstack, address_visited)) {
1654         mstack.push(shift, Pre_Visit);
1655       }
1656       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1657       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1658     } else {
1659       mstack.push(adr, Pre_Visit);
1660     }
1661 
1662     // Clone X+offset as it also folds into most addressing expressions
1663     mstack.push(off, Visit);
1664     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1665     return true;
1666   } else if (clone_shift(off, this, mstack, address_visited)) {
1667     address_visited.test_set(m->_idx); // Flag as address_visited
1668     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1669     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1670     return true;
1671   }
1672   return false;
1673 }
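
     // Sketch of the shapes handled above (assumed IR forms, for illustration):
     // an (AddP base (AddP base2 (LShiftX idx con)) off) with con <= 3 and an
     // immL32 offset lets the matcher fold the whole expression into a single
     // x86 addressing mode of the form [base2 + idx*scale + disp].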
1674 
1675 void Compile::reshape_address(AddPNode* addp) {
1676 }
1677 
1678 // Helper methods for MachSpillCopyNode::implementation().
1679 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1680                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
1681   // In the 64-bit VM size calculation is very complex, so instructions are
1682   // emitted into a scratch buffer to determine the size.
1683   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1684   assert(ireg == Op_VecS || // 32bit vector
1685          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1686          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1687          "no non-adjacent vector moves" );
1688   if (cbuf) {
1689     MacroAssembler _masm(cbuf);
1690     int offset = __ offset();
1691     switch (ireg) {
1692     case Op_VecS: // copy whole register
1693     case Op_VecD:
1694     case Op_VecX:
1695 #ifndef LP64
1696       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1697 #else
1698       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1699         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1700       } else {
1701         __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
1702         __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1703      }
1704 #endif
1705       break;
1706     case Op_VecY:
1707 #ifndef LP64
1708       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1709 #else
1710       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1711         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1712       } else {
1713         __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
1714         __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1715      }
1716 #endif
1717       break;
1718     case Op_VecZ:
1719       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1720       break;
1721     default:
1722       ShouldNotReachHere();
1723     }
1724     int size = __ offset() - offset;
1725 #ifdef ASSERT
1726     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1727     assert(!do_size || size == 4, "incorrect size calculation");
1728 #endif
1729     return size;
1730 #ifndef PRODUCT
1731   } else if (!do_size) {
1732     switch (ireg) {
1733     case Op_VecS:
1734     case Op_VecD:
1735     case Op_VecX:
1736       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1737       break;
1738     case Op_VecY:
1739     case Op_VecZ:
1740       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1741       break;
1742     default:
1743       ShouldNotReachHere();
1744     }
1745 #endif
1746   }
1747   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1748   return (UseAVX > 2) ? 6 : 4;
1749 }
1750 
1751 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1752                             int stack_offset, int reg, uint ireg, outputStream* st) {
1753   // In the 64-bit VM size calculation is very complex, so instructions are
1754   // emitted into a scratch buffer to determine the size.
1755   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1756   if (cbuf) {
1757     MacroAssembler _masm(cbuf);
1758     int offset = __ offset();
1759     if (is_load) {
1760       switch (ireg) {
1761       case Op_VecS:
1762         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1763         break;
1764       case Op_VecD:
1765         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1766         break;
1767       case Op_VecX:
1768 #ifndef LP64
1769         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1770 #else
1771         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1772           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1773         } else {
1774           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1775           __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1776         }
1777 #endif
1778         break;
1779       case Op_VecY:
1780 #ifndef LP64
1781         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1782 #else
1783         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1784           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1785         } else {
1786           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1787           __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1788         }
1789 #endif
1790         break;
1791       case Op_VecZ:
1792         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1793         break;
1794       default:
1795         ShouldNotReachHere();
1796       }
1797     } else { // store
1798       switch (ireg) {
1799       case Op_VecS:
1800         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1801         break;
1802       case Op_VecD:
1803         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1804         break;
1805       case Op_VecX:
1806 #ifndef LP64
1807         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1808 #else
1809         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1810           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1811         }
1812         else {
1813           __ vextracti32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1814         }
1815 #endif
1816         break;
1817       case Op_VecY:
1818 #ifndef LP64
1819         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1820 #else
1821         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1822           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1823         }
1824         else {
1825           __ vextracti64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1826         }
1827 #endif
1828         break;
1829       case Op_VecZ:
1830         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1831         break;
1832       default:
1833         ShouldNotReachHere();
1834       }
1835     }
1836     int size = __ offset() - offset;
1837 #ifdef ASSERT
1838     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1839     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1840     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1841 #endif
1842     return size;
1843 #ifndef PRODUCT
1844   } else if (!do_size) {
1845     if (is_load) {
1846       switch (ireg) {
1847       case Op_VecS:
1848         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1849         break;
1850       case Op_VecD:
1851         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1852         break;
1853        case Op_VecX:
1854         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1855         break;
1856       case Op_VecY:
1857       case Op_VecZ:
1858         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1859         break;
1860       default:
1861         ShouldNotReachHere();
1862       }
1863     } else { // store
1864       switch (ireg) {
1865       case Op_VecS:
1866         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1867         break;
1868       case Op_VecD:
1869         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1870         break;
1871        case Op_VecX:
1872         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1873         break;
1874       case Op_VecY:
1875       case Op_VecZ:
1876         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1877         break;
1878       default:
1879         ShouldNotReachHere();
1880       }
1881     }
1882 #endif
1883   }
1884   bool is_single_byte = false;
1885   int vec_len = 0;
1886   if ((UseAVX > 2) && (stack_offset != 0)) {
1887     int tuple_type = Assembler::EVEX_FVM;
1888     int input_size = Assembler::EVEX_32bit;
1889     switch (ireg) {
1890     case Op_VecS:
1891       tuple_type = Assembler::EVEX_T1S;
1892       break;
1893     case Op_VecD:
1894       tuple_type = Assembler::EVEX_T1S;
1895       input_size = Assembler::EVEX_64bit;
1896       break;
1897     case Op_VecX:
1898       break;
1899     case Op_VecY:
1900       vec_len = 1;
1901       break;
1902     case Op_VecZ:
1903       vec_len = 2;
1904       break;
1905     }
1906     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1907   }
1908   int offset_size = 0;
1909   int size = 5;
1910   if (UseAVX > 2 ) {
1911     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1912       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1913       size += 2; // Need an additional two bytes for EVEX encoding
1914     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1915       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1916     } else {
1917       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1918       size += 2; // Need an additional two bytes for EVEX encoding
1919     }
1920   } else {
1921     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1922   }
1923   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1924   return size+offset_size;
1925 }
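
     // Size-estimate example (illustrative): for a non-EVEX spill (UseAVX <= 2)
     // of an Op_VecX at stack_offset == 16 the displacement fits in one byte,
     // giving 5 + 1 = 6 bytes; a stack_offset of 256 needs a 4-byte
     // displacement, giving 5 + 4 = 9 bytes.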
1926 
1927 static inline jint replicate4_imm(int con, int width) {
1928   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1929   assert(width == 1 || width == 2, "only byte or short types here");
1930   int bit_width = width * 8;
1931   jint val = con;
1932   val &= (1 << bit_width) - 1;  // mask off sign bits
1933   while(bit_width < 32) {
1934     val |= (val << bit_width);
1935     bit_width <<= 1;
1936   }
1937   return val;
1938 }
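
     // Example (illustrative): replicate4_imm(0x8F, 1) masks the constant to
     // 0x8F and doubles it up through 0x8F8F to 0x8F8F8F8F;
     // replicate4_imm(0x1234, 2) yields 0x12341234.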
1939 
1940 static inline jlong replicate8_imm(int con, int width) {
1941   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1942   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1943   int bit_width = width * 8;
1944   jlong val = con;
1945   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1946   while(bit_width < 64) {
1947     val |= (val << bit_width);
1948     bit_width <<= 1;
1949   }
1950   return val;
1951 }
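
     // Example (illustrative): replicate8_imm(0x7B, 1) expands to
     // 0x7B7B7B7B7B7B7B7B and replicate8_imm(0x1234, 2) expands to
     // 0x1234123412341234.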
1952 
1953 #ifndef PRODUCT
1954   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1955     st->print("nop \t# %d bytes pad for loops and calls", _count);
1956   }
1957 #endif
1958 
1959   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1960     MacroAssembler _masm(&cbuf);
1961     __ nop(_count);
1962   }
1963 
1964   uint MachNopNode::size(PhaseRegAlloc*) const {
1965     return _count;
1966   }
1967 
1968 #ifndef PRODUCT
1969   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1970     st->print("# breakpoint");
1971   }
1972 #endif
1973 
1974   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1975     MacroAssembler _masm(&cbuf);
1976     __ int3();
1977   }
1978 
1979   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1980     return MachNode::size(ra_);
1981   }
1982 
1983 %}
1984 
1985 encode %{
1986 
1987   enc_class call_epilog %{
1988     if (VerifyStackAtCalls) {
1989       // Check that stack depth is unchanged: find magic cookie on stack
1990       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1991       MacroAssembler _masm(&cbuf);
1992       Label L;
1993       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1994       __ jccb(Assembler::equal, L);
1995       // Die if stack mismatch
1996       __ int3();
1997       __ bind(L);
1998     }
1999   %}
2000 
2001 %}
2002 
2003 
2004 //----------OPERANDS-----------------------------------------------------------
2005 // Operand definitions must precede instruction definitions for correct parsing
2006 // in the ADLC because operands constitute user defined types which are used in
2007 // instruction definitions.
2008 
2009 operand vecZ() %{
2010   constraint(ALLOC_IN_RC(vectorz_reg));
2011   match(VecZ);
2012 
2013   format %{ %}
2014   interface(REG_INTER);
2015 %}
2016 
2017 operand legVecZ() %{
2018   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2019   match(VecZ);
2020 
2021   format %{ %}
2022   interface(REG_INTER);
2023 %}
2024 
2025 // Comparison Code for FP conditional move
2026 operand cmpOp_vcmppd() %{
2027   match(Bool);
2028 
2029   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2030             n->as_Bool()->_test._test != BoolTest::no_overflow);
2031   format %{ "" %}
2032   interface(COND_INTER) %{
2033     equal        (0x0, "eq");
2034     less         (0x1, "lt");
2035     less_equal   (0x2, "le");
2036     not_equal    (0xC, "ne");
2037     greater_equal(0xD, "ge");
2038     greater      (0xE, "gt");
2039     //TODO cannot compile (adlc breaks) without two next lines with error:
2040     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2041     // equal' for overflow.
2042     overflow     (0x20, "o");  // not really supported by the instruction
2043     no_overflow  (0x21, "no"); // not really supported by the instruction
2044   %}
2045 %}
2046 
2047 
2048 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2049 
2050 // ============================================================================
2051 
2052 instruct ShouldNotReachHere() %{
2053   match(Halt);
2054   format %{ "ud2\t# ShouldNotReachHere" %}
2055   ins_encode %{
2056     __ ud2();
2057   %}
2058   ins_pipe(pipe_slow);
2059 %}
2060 
2061 // =================================EVEX special===============================
2062 
2063 instruct setMask(rRegI dst, rRegI src) %{
2064   predicate(Matcher::has_predicated_vectors());
2065   match(Set dst (SetVectMaskI  src));
2066   effect(TEMP dst);
2067   format %{ "setvectmask   $dst, $src" %}
2068   ins_encode %{
2069     __ setvectmask($dst$$Register, $src$$Register);
2070   %}
2071   ins_pipe(pipe_slow);
2072 %}
2073 
2074 // ============================================================================
2075 
2076 instruct addF_reg(regF dst, regF src) %{
2077   predicate((UseSSE>=1) && (UseAVX == 0));
2078   match(Set dst (AddF dst src));
2079 
2080   format %{ "addss   $dst, $src" %}
2081   ins_cost(150);
2082   ins_encode %{
2083     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2084   %}
2085   ins_pipe(pipe_slow);
2086 %}
2087 
2088 instruct addF_mem(regF dst, memory src) %{
2089   predicate((UseSSE>=1) && (UseAVX == 0));
2090   match(Set dst (AddF dst (LoadF src)));
2091 
2092   format %{ "addss   $dst, $src" %}
2093   ins_cost(150);
2094   ins_encode %{
2095     __ addss($dst$$XMMRegister, $src$$Address);
2096   %}
2097   ins_pipe(pipe_slow);
2098 %}
2099 
2100 instruct addF_imm(regF dst, immF con) %{
2101   predicate((UseSSE>=1) && (UseAVX == 0));
2102   match(Set dst (AddF dst con));
2103   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2104   ins_cost(150);
2105   ins_encode %{
2106     __ addss($dst$$XMMRegister, $constantaddress($con));
2107   %}
2108   ins_pipe(pipe_slow);
2109 %}
2110 
2111 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2112   predicate(UseAVX > 0);
2113   match(Set dst (AddF src1 src2));
2114 
2115   format %{ "vaddss  $dst, $src1, $src2" %}
2116   ins_cost(150);
2117   ins_encode %{
2118     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2119   %}
2120   ins_pipe(pipe_slow);
2121 %}
2122 
2123 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2124   predicate(UseAVX > 0);
2125   match(Set dst (AddF src1 (LoadF src2)));
2126 
2127   format %{ "vaddss  $dst, $src1, $src2" %}
2128   ins_cost(150);
2129   ins_encode %{
2130     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2131   %}
2132   ins_pipe(pipe_slow);
2133 %}
2134 
2135 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2136   predicate(UseAVX > 0);
2137   match(Set dst (AddF src con));
2138 
2139   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2140   ins_cost(150);
2141   ins_encode %{
2142     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2143   %}
2144   ins_pipe(pipe_slow);
2145 %}
2146 
2147 instruct addD_reg(regD dst, regD src) %{
2148   predicate((UseSSE>=2) && (UseAVX == 0));
2149   match(Set dst (AddD dst src));
2150 
2151   format %{ "addsd   $dst, $src" %}
2152   ins_cost(150);
2153   ins_encode %{
2154     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2155   %}
2156   ins_pipe(pipe_slow);
2157 %}
2158 
2159 instruct addD_mem(regD dst, memory src) %{
2160   predicate((UseSSE>=2) && (UseAVX == 0));
2161   match(Set dst (AddD dst (LoadD src)));
2162 
2163   format %{ "addsd   $dst, $src" %}
2164   ins_cost(150);
2165   ins_encode %{
2166     __ addsd($dst$$XMMRegister, $src$$Address);
2167   %}
2168   ins_pipe(pipe_slow);
2169 %}
2170 
2171 instruct addD_imm(regD dst, immD con) %{
2172   predicate((UseSSE>=2) && (UseAVX == 0));
2173   match(Set dst (AddD dst con));
2174   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2175   ins_cost(150);
2176   ins_encode %{
2177     __ addsd($dst$$XMMRegister, $constantaddress($con));
2178   %}
2179   ins_pipe(pipe_slow);
2180 %}
2181 
2182 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2183   predicate(UseAVX > 0);
2184   match(Set dst (AddD src1 src2));
2185 
2186   format %{ "vaddsd  $dst, $src1, $src2" %}
2187   ins_cost(150);
2188   ins_encode %{
2189     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2190   %}
2191   ins_pipe(pipe_slow);
2192 %}
2193 
2194 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2195   predicate(UseAVX > 0);
2196   match(Set dst (AddD src1 (LoadD src2)));
2197 
2198   format %{ "vaddsd  $dst, $src1, $src2" %}
2199   ins_cost(150);
2200   ins_encode %{
2201     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2202   %}
2203   ins_pipe(pipe_slow);
2204 %}
2205 
2206 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2207   predicate(UseAVX > 0);
2208   match(Set dst (AddD src con));
2209 
2210   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2211   ins_cost(150);
2212   ins_encode %{
2213     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2214   %}
2215   ins_pipe(pipe_slow);
2216 %}
2217 
2218 instruct subF_reg(regF dst, regF src) %{
2219   predicate((UseSSE>=1) && (UseAVX == 0));
2220   match(Set dst (SubF dst src));
2221 
2222   format %{ "subss   $dst, $src" %}
2223   ins_cost(150);
2224   ins_encode %{
2225     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2226   %}
2227   ins_pipe(pipe_slow);
2228 %}
2229 
2230 instruct subF_mem(regF dst, memory src) %{
2231   predicate((UseSSE>=1) && (UseAVX == 0));
2232   match(Set dst (SubF dst (LoadF src)));
2233 
2234   format %{ "subss   $dst, $src" %}
2235   ins_cost(150);
2236   ins_encode %{
2237     __ subss($dst$$XMMRegister, $src$$Address);
2238   %}
2239   ins_pipe(pipe_slow);
2240 %}
2241 
2242 instruct subF_imm(regF dst, immF con) %{
2243   predicate((UseSSE>=1) && (UseAVX == 0));
2244   match(Set dst (SubF dst con));
2245   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2246   ins_cost(150);
2247   ins_encode %{
2248     __ subss($dst$$XMMRegister, $constantaddress($con));
2249   %}
2250   ins_pipe(pipe_slow);
2251 %}
2252 
2253 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2254   predicate(UseAVX > 0);
2255   match(Set dst (SubF src1 src2));
2256 
2257   format %{ "vsubss  $dst, $src1, $src2" %}
2258   ins_cost(150);
2259   ins_encode %{
2260     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2261   %}
2262   ins_pipe(pipe_slow);
2263 %}
2264 
2265 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2266   predicate(UseAVX > 0);
2267   match(Set dst (SubF src1 (LoadF src2)));
2268 
2269   format %{ "vsubss  $dst, $src1, $src2" %}
2270   ins_cost(150);
2271   ins_encode %{
2272     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2273   %}
2274   ins_pipe(pipe_slow);
2275 %}
2276 
2277 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2278   predicate(UseAVX > 0);
2279   match(Set dst (SubF src con));
2280 
2281   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2282   ins_cost(150);
2283   ins_encode %{
2284     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2285   %}
2286   ins_pipe(pipe_slow);
2287 %}
2288 
2289 instruct subD_reg(regD dst, regD src) %{
2290   predicate((UseSSE>=2) && (UseAVX == 0));
2291   match(Set dst (SubD dst src));
2292 
2293   format %{ "subsd   $dst, $src" %}
2294   ins_cost(150);
2295   ins_encode %{
2296     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2297   %}
2298   ins_pipe(pipe_slow);
2299 %}
2300 
2301 instruct subD_mem(regD dst, memory src) %{
2302   predicate((UseSSE>=2) && (UseAVX == 0));
2303   match(Set dst (SubD dst (LoadD src)));
2304 
2305   format %{ "subsd   $dst, $src" %}
2306   ins_cost(150);
2307   ins_encode %{
2308     __ subsd($dst$$XMMRegister, $src$$Address);
2309   %}
2310   ins_pipe(pipe_slow);
2311 %}
2312 
2313 instruct subD_imm(regD dst, immD con) %{
2314   predicate((UseSSE>=2) && (UseAVX == 0));
2315   match(Set dst (SubD dst con));
2316   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2317   ins_cost(150);
2318   ins_encode %{
2319     __ subsd($dst$$XMMRegister, $constantaddress($con));
2320   %}
2321   ins_pipe(pipe_slow);
2322 %}
2323 
2324 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2325   predicate(UseAVX > 0);
2326   match(Set dst (SubD src1 src2));
2327 
2328   format %{ "vsubsd  $dst, $src1, $src2" %}
2329   ins_cost(150);
2330   ins_encode %{
2331     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2332   %}
2333   ins_pipe(pipe_slow);
2334 %}
2335 
2336 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2337   predicate(UseAVX > 0);
2338   match(Set dst (SubD src1 (LoadD src2)));
2339 
2340   format %{ "vsubsd  $dst, $src1, $src2" %}
2341   ins_cost(150);
2342   ins_encode %{
2343     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2344   %}
2345   ins_pipe(pipe_slow);
2346 %}
2347 
2348 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2349   predicate(UseAVX > 0);
2350   match(Set dst (SubD src con));
2351 
2352   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2353   ins_cost(150);
2354   ins_encode %{
2355     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2356   %}
2357   ins_pipe(pipe_slow);
2358 %}
2359 
2360 instruct mulF_reg(regF dst, regF src) %{
2361   predicate((UseSSE>=1) && (UseAVX == 0));
2362   match(Set dst (MulF dst src));
2363 
2364   format %{ "mulss   $dst, $src" %}
2365   ins_cost(150);
2366   ins_encode %{
2367     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2368   %}
2369   ins_pipe(pipe_slow);
2370 %}
2371 
2372 instruct mulF_mem(regF dst, memory src) %{
2373   predicate((UseSSE>=1) && (UseAVX == 0));
2374   match(Set dst (MulF dst (LoadF src)));
2375 
2376   format %{ "mulss   $dst, $src" %}
2377   ins_cost(150);
2378   ins_encode %{
2379     __ mulss($dst$$XMMRegister, $src$$Address);
2380   %}
2381   ins_pipe(pipe_slow);
2382 %}
2383 
2384 instruct mulF_imm(regF dst, immF con) %{
2385   predicate((UseSSE>=1) && (UseAVX == 0));
2386   match(Set dst (MulF dst con));
2387   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2388   ins_cost(150);
2389   ins_encode %{
2390     __ mulss($dst$$XMMRegister, $constantaddress($con));
2391   %}
2392   ins_pipe(pipe_slow);
2393 %}
2394 
2395 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2396   predicate(UseAVX > 0);
2397   match(Set dst (MulF src1 src2));
2398 
2399   format %{ "vmulss  $dst, $src1, $src2" %}
2400   ins_cost(150);
2401   ins_encode %{
2402     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2403   %}
2404   ins_pipe(pipe_slow);
2405 %}
2406 
2407 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2408   predicate(UseAVX > 0);
2409   match(Set dst (MulF src1 (LoadF src2)));
2410 
2411   format %{ "vmulss  $dst, $src1, $src2" %}
2412   ins_cost(150);
2413   ins_encode %{
2414     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2415   %}
2416   ins_pipe(pipe_slow);
2417 %}
2418 
2419 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2420   predicate(UseAVX > 0);
2421   match(Set dst (MulF src con));
2422 
2423   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2424   ins_cost(150);
2425   ins_encode %{
2426     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2427   %}
2428   ins_pipe(pipe_slow);
2429 %}
2430 
2431 instruct mulD_reg(regD dst, regD src) %{
2432   predicate((UseSSE>=2) && (UseAVX == 0));
2433   match(Set dst (MulD dst src));
2434 
2435   format %{ "mulsd   $dst, $src" %}
2436   ins_cost(150);
2437   ins_encode %{
2438     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2439   %}
2440   ins_pipe(pipe_slow);
2441 %}
2442 
2443 instruct mulD_mem(regD dst, memory src) %{
2444   predicate((UseSSE>=2) && (UseAVX == 0));
2445   match(Set dst (MulD dst (LoadD src)));
2446 
2447   format %{ "mulsd   $dst, $src" %}
2448   ins_cost(150);
2449   ins_encode %{
2450     __ mulsd($dst$$XMMRegister, $src$$Address);
2451   %}
2452   ins_pipe(pipe_slow);
2453 %}
2454 
2455 instruct mulD_imm(regD dst, immD con) %{
2456   predicate((UseSSE>=2) && (UseAVX == 0));
2457   match(Set dst (MulD dst con));
2458   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2459   ins_cost(150);
2460   ins_encode %{
2461     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2462   %}
2463   ins_pipe(pipe_slow);
2464 %}
2465 
2466 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2467   predicate(UseAVX > 0);
2468   match(Set dst (MulD src1 src2));
2469 
2470   format %{ "vmulsd  $dst, $src1, $src2" %}
2471   ins_cost(150);
2472   ins_encode %{
2473     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2474   %}
2475   ins_pipe(pipe_slow);
2476 %}
2477 
2478 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2479   predicate(UseAVX > 0);
2480   match(Set dst (MulD src1 (LoadD src2)));
2481 
2482   format %{ "vmulsd  $dst, $src1, $src2" %}
2483   ins_cost(150);
2484   ins_encode %{
2485     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2486   %}
2487   ins_pipe(pipe_slow);
2488 %}
2489 
2490 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2491   predicate(UseAVX > 0);
2492   match(Set dst (MulD src con));
2493 
2494   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2495   ins_cost(150);
2496   ins_encode %{
2497     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2498   %}
2499   ins_pipe(pipe_slow);
2500 %}
2501 
2502 instruct divF_reg(regF dst, regF src) %{
2503   predicate((UseSSE>=1) && (UseAVX == 0));
2504   match(Set dst (DivF dst src));
2505 
2506   format %{ "divss   $dst, $src" %}
2507   ins_cost(150);
2508   ins_encode %{
2509     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2510   %}
2511   ins_pipe(pipe_slow);
2512 %}
2513 
2514 instruct divF_mem(regF dst, memory src) %{
2515   predicate((UseSSE>=1) && (UseAVX == 0));
2516   match(Set dst (DivF dst (LoadF src)));
2517 
2518   format %{ "divss   $dst, $src" %}
2519   ins_cost(150);
2520   ins_encode %{
2521     __ divss($dst$$XMMRegister, $src$$Address);
2522   %}
2523   ins_pipe(pipe_slow);
2524 %}
2525 
2526 instruct divF_imm(regF dst, immF con) %{
2527   predicate((UseSSE>=1) && (UseAVX == 0));
2528   match(Set dst (DivF dst con));
2529   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2530   ins_cost(150);
2531   ins_encode %{
2532     __ divss($dst$$XMMRegister, $constantaddress($con));
2533   %}
2534   ins_pipe(pipe_slow);
2535 %}
2536 
2537 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2538   predicate(UseAVX > 0);
2539   match(Set dst (DivF src1 src2));
2540 
2541   format %{ "vdivss  $dst, $src1, $src2" %}
2542   ins_cost(150);
2543   ins_encode %{
2544     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2545   %}
2546   ins_pipe(pipe_slow);
2547 %}
2548 
2549 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2550   predicate(UseAVX > 0);
2551   match(Set dst (DivF src1 (LoadF src2)));
2552 
2553   format %{ "vdivss  $dst, $src1, $src2" %}
2554   ins_cost(150);
2555   ins_encode %{
2556     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2557   %}
2558   ins_pipe(pipe_slow);
2559 %}
2560 
2561 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2562   predicate(UseAVX > 0);
2563   match(Set dst (DivF src con));
2564 
2565   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2566   ins_cost(150);
2567   ins_encode %{
2568     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2569   %}
2570   ins_pipe(pipe_slow);
2571 %}
2572 
2573 instruct divD_reg(regD dst, regD src) %{
2574   predicate((UseSSE>=2) && (UseAVX == 0));
2575   match(Set dst (DivD dst src));
2576 
2577   format %{ "divsd   $dst, $src" %}
2578   ins_cost(150);
2579   ins_encode %{
2580     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2581   %}
2582   ins_pipe(pipe_slow);
2583 %}
2584 
2585 instruct divD_mem(regD dst, memory src) %{
2586   predicate((UseSSE>=2) && (UseAVX == 0));
2587   match(Set dst (DivD dst (LoadD src)));
2588 
2589   format %{ "divsd   $dst, $src" %}
2590   ins_cost(150);
2591   ins_encode %{
2592     __ divsd($dst$$XMMRegister, $src$$Address);
2593   %}
2594   ins_pipe(pipe_slow);
2595 %}
2596 
2597 instruct divD_imm(regD dst, immD con) %{
2598   predicate((UseSSE>=2) && (UseAVX == 0));
2599   match(Set dst (DivD dst con));
2600   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2601   ins_cost(150);
2602   ins_encode %{
2603     __ divsd($dst$$XMMRegister, $constantaddress($con));
2604   %}
2605   ins_pipe(pipe_slow);
2606 %}
2607 
2608 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2609   predicate(UseAVX > 0);
2610   match(Set dst (DivD src1 src2));
2611 
2612   format %{ "vdivsd  $dst, $src1, $src2" %}
2613   ins_cost(150);
2614   ins_encode %{
2615     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2616   %}
2617   ins_pipe(pipe_slow);
2618 %}
2619 
2620 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2621   predicate(UseAVX > 0);
2622   match(Set dst (DivD src1 (LoadD src2)));
2623 
2624   format %{ "vdivsd  $dst, $src1, $src2" %}
2625   ins_cost(150);
2626   ins_encode %{
2627     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2628   %}
2629   ins_pipe(pipe_slow);
2630 %}
2631 
2632 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2633   predicate(UseAVX > 0);
2634   match(Set dst (DivD src con));
2635 
2636   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2637   ins_cost(150);
2638   ins_encode %{
2639     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2640   %}
2641   ins_pipe(pipe_slow);
2642 %}
2643 
2644 instruct absF_reg(regF dst) %{
2645   predicate((UseSSE>=1) && (UseAVX == 0));
2646   match(Set dst (AbsF dst));
2647   ins_cost(150);
2648   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2649   ins_encode %{
2650     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2651   %}
2652   ins_pipe(pipe_slow);
2653 %}
2654 
2655 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2656   predicate(UseAVX > 0);
2657   match(Set dst (AbsF src));
2658   ins_cost(150);
2659   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2660   ins_encode %{
2661     int vector_len = 0;
2662     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2663               ExternalAddress(float_signmask()), vector_len);
2664   %}
2665   ins_pipe(pipe_slow);
2666 %}
2667 
2668 instruct absD_reg(regD dst) %{
2669   predicate((UseSSE>=2) && (UseAVX == 0));
2670   match(Set dst (AbsD dst));
2671   ins_cost(150);
2672   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2673             "# abs double by sign masking" %}
2674   ins_encode %{
2675     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2676   %}
2677   ins_pipe(pipe_slow);
2678 %}
2679 
2680 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2681   predicate(UseAVX > 0);
2682   match(Set dst (AbsD src));
2683   ins_cost(150);
2684   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2685             "# abs double by sign masking" %}
2686   ins_encode %{
2687     int vector_len = 0;
2688     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2689               ExternalAddress(double_signmask()), vector_len);
2690   %}
2691   ins_pipe(pipe_slow);
2692 %}
2693 
2694 instruct negF_reg(regF dst) %{
2695   predicate((UseSSE>=1) && (UseAVX == 0));
2696   match(Set dst (NegF dst));
2697   ins_cost(150);
2698   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2699   ins_encode %{
2700     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2701   %}
2702   ins_pipe(pipe_slow);
2703 %}
2704 
2705 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2706   predicate(UseAVX > 0);
2707   match(Set dst (NegF src));
2708   ins_cost(150);
2709   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2710   ins_encode %{
2711     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2712                  ExternalAddress(float_signflip()));
2713   %}
2714   ins_pipe(pipe_slow);
2715 %}
2716 
2717 instruct negD_reg(regD dst) %{
2718   predicate((UseSSE>=2) && (UseAVX == 0));
2719   match(Set dst (NegD dst));
2720   ins_cost(150);
2721   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2722             "# neg double by sign flipping" %}
2723   ins_encode %{
2724     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2725   %}
2726   ins_pipe(pipe_slow);
2727 %}
2728 
2729 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2730   predicate(UseAVX > 0);
2731   match(Set dst (NegD src));
2732   ins_cost(150);
2733   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2734             "# neg double by sign flipping" %}
2735   ins_encode %{
2736     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2737                  ExternalAddress(double_signflip()));
2738   %}
2739   ins_pipe(pipe_slow);
2740 %}
2741 
2742 instruct sqrtF_reg(regF dst, regF src) %{
2743   predicate(UseSSE>=1);
2744   match(Set dst (SqrtF src));
2745 
2746   format %{ "sqrtss  $dst, $src" %}
2747   ins_cost(150);
2748   ins_encode %{
2749     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2750   %}
2751   ins_pipe(pipe_slow);
2752 %}
2753 
2754 instruct sqrtF_mem(regF dst, memory src) %{
2755   predicate(UseSSE>=1);
2756   match(Set dst (SqrtF (LoadF src)));
2757 
2758   format %{ "sqrtss  $dst, $src" %}
2759   ins_cost(150);
2760   ins_encode %{
2761     __ sqrtss($dst$$XMMRegister, $src$$Address);
2762   %}
2763   ins_pipe(pipe_slow);
2764 %}
2765 
2766 instruct sqrtF_imm(regF dst, immF con) %{
2767   predicate(UseSSE>=1);
2768   match(Set dst (SqrtF con));
2769 
2770   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2771   ins_cost(150);
2772   ins_encode %{
2773     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2774   %}
2775   ins_pipe(pipe_slow);
2776 %}
2777 
2778 instruct sqrtD_reg(regD dst, regD src) %{
2779   predicate(UseSSE>=2);
2780   match(Set dst (SqrtD src));
2781 
2782   format %{ "sqrtsd  $dst, $src" %}
2783   ins_cost(150);
2784   ins_encode %{
2785     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2786   %}
2787   ins_pipe(pipe_slow);
2788 %}
2789 
2790 instruct sqrtD_mem(regD dst, memory src) %{
2791   predicate(UseSSE>=2);
2792   match(Set dst (SqrtD (LoadD src)));
2793 
2794   format %{ "sqrtsd  $dst, $src" %}
2795   ins_cost(150);
2796   ins_encode %{
2797     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2798   %}
2799   ins_pipe(pipe_slow);
2800 %}
2801 
2802 instruct sqrtD_imm(regD dst, immD con) %{
2803   predicate(UseSSE>=2);
2804   match(Set dst (SqrtD con));
2805   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2806   ins_cost(150);
2807   ins_encode %{
2808     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2809   %}
2810   ins_pipe(pipe_slow);
2811 %}
2812 
2813 instruct onspinwait() %{
2814   match(OnSpinWait);
2815   ins_cost(200);
2816 
2817   format %{
2818     $$template
2819     $$emit$$"pause\t! membar_onspinwait"
2820   %}
2821   ins_encode %{
2822     __ pause();
2823   %}
2824   ins_pipe(pipe_slow);
2825 %}
2826 
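     // Fused multiply-add: computes a * b + c with a single rounding step (the
     // semantics of Math.fma); these forms are matched only when the UseFMA flag
     // is enabled.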
2827 // a * b + c
2828 instruct fmaD_reg(regD a, regD b, regD c) %{
2829   predicate(UseFMA);
2830   match(Set c (FmaD  c (Binary a b)));
2831   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2832   ins_cost(150);
2833   ins_encode %{
2834     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2835   %}
2836   ins_pipe( pipe_slow );
2837 %}
2838 
2839 // a * b + c
2840 instruct fmaF_reg(regF a, regF b, regF c) %{
2841   predicate(UseFMA);
2842   match(Set c (FmaF  c (Binary a b)));
2843   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2844   ins_cost(150);
2845   ins_encode %{
2846     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2847   %}
2848   ins_pipe( pipe_slow );
2849 %}
2850 
2851 // The following pseudo code describes the algorithm for max[FD]/min[FD]:
2852 //  if ( b < 0 )
2853 //    swap(a, b)
2854 //  Tmp  = Max_Float(a, b)
2855 //  Mask = isNaN(a) ? 1 : 0
2856 //  Res  = Mask ? a : Tmp
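     // For example, Math.max(-0.0f, +0.0f) must return +0.0f and Math.max(x, NaN)
     // must return NaN; a bare vmaxps/vminps guarantees neither, hence the blend/swap
     // steps before the min/max and the NaN blend after it.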
2857 instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF mask) %{
2858   predicate(UseAVX > 0);
2859   match(Set dst (MaxF a b));
2860   effect(USE a, USE b, TEMP tmp, TEMP mask);
2861   format %{ 
2862      "blendvps         $tmp,$b,$a,$b   \n\t"
2863      "blendvps         $a,$a,$b,$b     \n\t"
2864      "movaps           $b,$tmp         \n\t"
2865      "vmaxps           $tmp,$a,$b      \n\t"
2866      "cmpps.unordered  $mask, $a, $a   \n\t"
2867      "blendvps         $dst,$tmp,$a,$mask  \n\t"
2868   %}
2869   ins_encode %{
2870     int vector_len = 0;
2871     __ blendvps($tmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
2872     __ blendvps($a$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
2873     __ movflt($b$$XMMRegister , $tmp$$XMMRegister);
2874     __ vmaxps($tmp$$XMMRegister, $a$$XMMRegister , $b$$XMMRegister);
2875     __ cmpps($mask$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, 0x3, vector_len);
2876     __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $a$$XMMRegister, $mask$$XMMRegister, vector_len);
2877   %}
2878   ins_pipe( pipe_slow );
2879 %}
2880 
2881 // max = java.lang.Math.max(double a, double b)
2882 instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD mask) %{
2883   predicate(UseAVX > 0);
2884   match(Set dst (MaxD a b));
2885   effect(USE a, USE b, TEMP tmp, TEMP mask);
2886   format %{ 
2887      "blendvpd         $tmp,$b,$a,$b   \n\t"
2888      "blendvpd         $a,$a,$b,$b     \n\t"
2889      "movapd           $b,$tmp         \n\t"
2890      "vmaxpd           $tmp,$a,$b      \n\t"
2891      "cmppd.unordered  $mask, $a, $a   \n\t" 
2892      "blendvpd         $dst,$tmp,$a,$mask  \n\t"
2893   %}
2894   ins_encode %{
2895     int vector_len = 0;
2896     __ blendvpd($tmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
2897     __ blendvpd($a$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
2898     __ movdbl($b$$XMMRegister , $tmp$$XMMRegister);
2899     __ vmaxpd($tmp$$XMMRegister, $a$$XMMRegister , $b$$XMMRegister);
2900     __ cmppd($mask$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, 0x3, vector_len);
2901     __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $a$$XMMRegister, $mask$$XMMRegister, vector_len);
2902   %}
2903   ins_pipe( pipe_slow );
2904 %}
2905 
2906 
2907 // min = java.lang.Math.min(float a, float b)
2908 instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF mask) %{
2909   predicate(UseAVX > 0);
2910   match(Set dst (MinF a b));
2911   effect(USE a, USE b, TEMP tmp, TEMP mask, DEF dst);
2912   format %{ 
2913      "blendvps         $tmp,$a,$b,$a   \n\t"
2914      "blendvps         $b,$b,$a,$a     \n\t"
2915      "movaps           $a,$tmp         \n\t"
2916      "vminps           $tmp,$a,$b      \n\t"
2917      "cmpps.unordered  $mask, $a, $a   \n\t" 
2918      "blendvps         $dst,$tmp,$a,$mask  \n\t"
2919   %}
2920   ins_encode %{
2921     int vector_len = 0;
2922     __ blendvps($tmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
2923     __ blendvps($b$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
2924     __ movflt($a$$XMMRegister , $tmp$$XMMRegister);
2925     __ vminps($tmp$$XMMRegister, $a$$XMMRegister , $b$$XMMRegister);
2926     __ cmpps($mask$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, 0x3, vector_len);
2927     __ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $a$$XMMRegister, $mask$$XMMRegister, vector_len);
2928   %}
2929   ins_pipe( pipe_slow );
2930 %}
2931 
2932 // min = java.lang.Math.min(double a, double b)
2933 instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD mask) %{
2934   predicate(UseAVX > 0);
2935   match(Set dst (MinD a b));
2936   effect(USE a, USE b, TEMP tmp, TEMP mask, DEF dst);
2937   format %{ 
2938      "blendvpd         $tmp,$a,$b,$a   \n\t"
2939      "blendvpd         $b,$b,$a,$a     \n\t"
2940      "movapd           $a,$tmp         \n\t"
2941      "vminpd           $tmp,$a,$b      \n\t"
2942      "cmppd.unordered  $mask, $a, $a   \n\t" 
2943      "blendvpd         $dst,$tmp,$a,$mask  \n\t"
2944   %}
2945   ins_encode %{
2946     int vector_len = 0;
2947     __ blendvpd($tmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
2948     __ blendvpd($b$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
2949     __ movdbl($a$$XMMRegister , $tmp$$XMMRegister);
2950     __ vminpd($tmp$$XMMRegister, $a$$XMMRegister , $b$$XMMRegister);
2951     __ cmppd($mask$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, 0x3, vector_len);
2952     __ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $a$$XMMRegister, $mask$$XMMRegister, vector_len);
2953   %}
2954   ins_pipe( pipe_slow );
2955 %}
2956 
2957 // ====================VECTOR INSTRUCTIONS=====================================
2958 
2959 
2960 // Load vectors (4 bytes long)
2961 instruct loadV4(vecS dst, memory mem) %{
2962   predicate(n->as_LoadVector()->memory_size() == 4);
2963   match(Set dst (LoadVector mem));
2964   ins_cost(125);
2965   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2966   ins_encode %{
2967     __ movdl($dst$$XMMRegister, $mem$$Address);
2968   %}
2969   ins_pipe( pipe_slow );
2970 %}
2971 
2972 // Move vectors (4 bytes long) between register classes
2973 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2974   match(Set dst src);
2975   format %{ "movss $dst,$src\t! Move vector (4 bytes)" %}
2976   ins_encode %{
2977     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2978   %}
2979   ins_pipe( fpu_reg_reg );
2980 %}
2981 
2982 // Move vectors (4 bytes long) between register classes
2983 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2984   match(Set dst src);
2985   format %{ "movss $dst,$src\t! Move vector (4 bytes)" %}
2986   ins_encode %{
2987     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2988   %}
2989   ins_pipe( fpu_reg_reg );
2990 %}
2991 
2992 // Load vectors (8 bytes long)
2993 instruct loadV8(vecD dst, memory mem) %{
2994   predicate(n->as_LoadVector()->memory_size() == 8);
2995   match(Set dst (LoadVector mem));
2996   ins_cost(125);
2997   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2998   ins_encode %{
2999     __ movq($dst$$XMMRegister, $mem$$Address);
3000   %}
3001   ins_pipe( pipe_slow );
3002 %}
3003 
3004 // Move vectors (8 bytes long) between register classes
3005 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
3006   match(Set dst src);
3007   format %{ "movsd $dst,$src\t! Move vector (8 bytes)" %}
3008   ins_encode %{
3009     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3010   %}
3011   ins_pipe( fpu_reg_reg );
3012 %}
3013 
3014 // Move vectors (8 bytes long) between register classes
3015 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
3016   match(Set dst src);
3017   format %{ "movsd $dst,$src\t! Move vector (8 bytes)" %}
3018   ins_encode %{
3019     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3020   %}
3021   ins_pipe( fpu_reg_reg );
3022 %}
3023 
3024 // Load vectors (16 bytes long)
3025 instruct loadV16(vecX dst, memory mem) %{
3026   predicate(n->as_LoadVector()->memory_size() == 16);
3027   match(Set dst (LoadVector mem));
3028   ins_cost(125);
3029   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3030   ins_encode %{
3031     __ movdqu($dst$$XMMRegister, $mem$$Address);
3032   %}
3033   ins_pipe( pipe_slow );
3034 %}
3035 
3036 // Move vectors (16 bytes long) between register classes
3037 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
3038   match(Set dst src);
3039   format %{ "movdqu $dst,$src\t! Move vector (16 bytes)" %}
3040   ins_encode %{
3041     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3042       int vector_len = 2;
3043       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3044     } else {
3045       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3046     }
3047   %}
3048   ins_pipe( fpu_reg_reg );
3049 %}
3050 
3051 // Move vectors (16 bytes long) between register classes
3052 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
3053   match(Set dst src);
3054   format %{ "movdqu $dst,$src\t! Move vector (16 bytes)" %}
3055   ins_encode %{
3056     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3057       int vector_len = 2;
3058       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3059     } else {
3060       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3061     }
3062   %}
3063   ins_pipe( fpu_reg_reg );
3064 %}
3065 
3066 // Load vectors (32 bytes long)
3067 instruct loadV32(vecY dst, memory mem) %{
3068   predicate(n->as_LoadVector()->memory_size() == 32);
3069   match(Set dst (LoadVector mem));
3070   ins_cost(125);
3071   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3072   ins_encode %{
3073     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3074   %}
3075   ins_pipe( pipe_slow );
3076 %}
3077 
3078 // Move vectors (32 bytes long) between register classes
3079 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3080   match(Set dst src);
3081   format %{ "vmovdqu $dst,$src\t! Move vector (32 bytes)" %}
3082   ins_encode %{
3083     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3084       int vector_len = 2;
3085       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3086     } else {
3087       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3088     }
3089   %}
3090   ins_pipe( fpu_reg_reg );
3091 %}
3092 
3093 // Move vectors (32 bytes long) between register classes
3094 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3095   match(Set dst src);
3096   format %{ "vmovdqu $dst,$src\t! Move vector (32 bytes)" %}
3097   ins_encode %{
3098     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3099       int vector_len = 2;
3100       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3101     } else {
3102       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3103     }
3104   %}
3105   ins_pipe( fpu_reg_reg );
3106 %}
3107 
3108 // Load vectors (64 bytes long)
3109 instruct loadV64_dword(vecZ dst, memory mem) %{
3110   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3111   match(Set dst (LoadVector mem));
3112   ins_cost(125);
3113   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3114   ins_encode %{
3115     int vector_len = 2;
3116     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3117   %}
3118   ins_pipe( pipe_slow );
3119 %}
3120 
3121 // Load vectors (64 bytes long)
3122 instruct loadV64_qword(vecZ dst, memory mem) %{
3123   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3124   match(Set dst (LoadVector mem));
3125   ins_cost(125);
3126   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3127   ins_encode %{
3128     int vector_len = 2;
3129     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3130   %}
3131   ins_pipe( pipe_slow );
3132 %}
3133 
3134 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3135   match(Set dst src);
3136   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3137   ins_encode %{
3138     int vector_len = 2;
3139     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3140   %}
3141   ins_pipe( fpu_reg_reg );
3142 %}
3143 
3144 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3145   match(Set dst src);
3146   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3147   ins_encode %{
3148     int vector_len = 2;
3149     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3150   %}
3151   ins_pipe( fpu_reg_reg );
3152 %}
3153 
3154 // Store vectors
3155 instruct storeV4(memory mem, vecS src) %{
3156   predicate(n->as_StoreVector()->memory_size() == 4);
3157   match(Set mem (StoreVector mem src));
3158   ins_cost(145);
3159   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3160   ins_encode %{
3161     __ movdl($mem$$Address, $src$$XMMRegister);
3162   %}
3163   ins_pipe( pipe_slow );
3164 %}
3165 
3166 instruct storeV8(memory mem, vecD src) %{
3167   predicate(n->as_StoreVector()->memory_size() == 8);
3168   match(Set mem (StoreVector mem src));
3169   ins_cost(145);
3170   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3171   ins_encode %{
3172     __ movq($mem$$Address, $src$$XMMRegister);
3173   %}
3174   ins_pipe( pipe_slow );
3175 %}
3176 
3177 instruct storeV16(memory mem, vecX src) %{
3178   predicate(n->as_StoreVector()->memory_size() == 16);
3179   match(Set mem (StoreVector mem src));
3180   ins_cost(145);
3181   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3182   ins_encode %{
3183     __ movdqu($mem$$Address, $src$$XMMRegister);
3184   %}
3185   ins_pipe( pipe_slow );
3186 %}
3187 
3188 instruct storeV32(memory mem, vecY src) %{
3189   predicate(n->as_StoreVector()->memory_size() == 32);
3190   match(Set mem (StoreVector mem src));
3191   ins_cost(145);
3192   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3193   ins_encode %{
3194     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3195   %}
3196   ins_pipe( pipe_slow );
3197 %}
3198 
3199 instruct storeV64_dword(memory mem, vecZ src) %{
3200   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3201   match(Set mem (StoreVector mem src));
3202   ins_cost(145);
3203   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3204   ins_encode %{
3205     int vector_len = 2;
3206     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3207   %}
3208   ins_pipe( pipe_slow );
3209 %}
3210 
3211 instruct storeV64_qword(memory mem, vecZ src) %{
3212   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3213   match(Set mem (StoreVector mem src));
3214   ins_cost(145);
3215   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3216   ins_encode %{
3217     int vector_len = 2;
3218     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3219   %}
3220   ins_pipe( pipe_slow );
3221 %}
3222 
3223 // ====================LEGACY REPLICATE=======================================
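     // These broadcast forms are used when the relevant AVX-512 VL/BW extensions are
     // not available: the scalar (or constant) is replicated within an XMM register
     // with punpck*/pshuf* shuffles and then widened to YMM/ZMM with
     // vinserti128_high / vinserti64x4 lane inserts.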
3224 
3225 instruct Repl4B_mem(vecS dst, memory mem) %{
3226   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3227   match(Set dst (ReplicateB (LoadB mem)));
3228   format %{ "punpcklbw $dst,$mem\n\t"
3229             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3230   ins_encode %{
3231     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3232     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3233   %}
3234   ins_pipe( pipe_slow );
3235 %}
3236 
3237 instruct Repl8B_mem(vecD dst, memory mem) %{
3238   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3239   match(Set dst (ReplicateB (LoadB mem)));
3240   format %{ "punpcklbw $dst,$mem\n\t"
3241             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3242   ins_encode %{
3243     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3244     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3245   %}
3246   ins_pipe( pipe_slow );
3247 %}
3248 
3249 instruct Repl16B(vecX dst, rRegI src) %{
3250   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3251   match(Set dst (ReplicateB src));
3252   format %{ "movd    $dst,$src\n\t"
3253             "punpcklbw $dst,$dst\n\t"
3254             "pshuflw $dst,$dst,0x00\n\t"
3255             "punpcklqdq $dst,$dst\t! replicate16B" %}
3256   ins_encode %{
3257     __ movdl($dst$$XMMRegister, $src$$Register);
3258     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3259     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3260     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3261   %}
3262   ins_pipe( pipe_slow );
3263 %}
3264 
3265 instruct Repl16B_mem(vecX dst, memory mem) %{
3266   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3267   match(Set dst (ReplicateB (LoadB mem)));
3268   format %{ "punpcklbw $dst,$mem\n\t"
3269             "pshuflw $dst,$dst,0x00\n\t"
3270             "punpcklqdq $dst,$dst\t! replicate16B" %}
3271   ins_encode %{
3272     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3273     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3274     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3275   %}
3276   ins_pipe( pipe_slow );
3277 %}
3278 
3279 instruct Repl32B(vecY dst, rRegI src) %{
3280   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3281   match(Set dst (ReplicateB src));
3282   format %{ "movd    $dst,$src\n\t"
3283             "punpcklbw $dst,$dst\n\t"
3284             "pshuflw $dst,$dst,0x00\n\t"
3285             "punpcklqdq $dst,$dst\n\t"
3286             "vinserti128_high $dst,$dst\t! replicate32B" %}
3287   ins_encode %{
3288     __ movdl($dst$$XMMRegister, $src$$Register);
3289     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3290     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3291     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3292     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3293   %}
3294   ins_pipe( pipe_slow );
3295 %}
3296 
3297 instruct Repl32B_mem(vecY dst, memory mem) %{
3298   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3299   match(Set dst (ReplicateB (LoadB mem)));
3300   format %{ "punpcklbw $dst,$mem\n\t"
3301             "pshuflw $dst,$dst,0x00\n\t"
3302             "punpcklqdq $dst,$dst\n\t"
3303             "vinserti128_high $dst,$dst\t! replicate32B" %}
3304   ins_encode %{
3305     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3306     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3307     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3308     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3309   %}
3310   ins_pipe( pipe_slow );
3311 %}
3312 
3313 instruct Repl64B(legVecZ dst, rRegI src) %{
3314   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3315   match(Set dst (ReplicateB src));
3316   format %{ "movd    $dst,$src\n\t"
3317             "punpcklbw $dst,$dst\n\t"
3318             "pshuflw $dst,$dst,0x00\n\t"
3319             "punpcklqdq $dst,$dst\n\t"
3320             "vinserti128_high $dst,$dst\t"
3321             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3322   ins_encode %{
3323     __ movdl($dst$$XMMRegister, $src$$Register);
3324     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3325     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3326     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3327     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3328     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3329   %}
3330   ins_pipe( pipe_slow );
3331 %}
3332 
3333 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3334   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3335   match(Set dst (ReplicateB (LoadB mem)));
3336   format %{ "punpcklbw $dst,$mem\n\t"
3337             "pshuflw $dst,$dst,0x00\n\t"
3338             "punpcklqdq $dst,$dst\n\t"
3339             "vinserti128_high $dst,$dst\t"
3340             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3341   ins_encode %{
3342     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3343     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3344     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3345     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3346     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3347   %}
3348   ins_pipe( pipe_slow );
3349 %}
3350 
3351 instruct Repl16B_imm(vecX dst, immI con) %{
3352   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3353   match(Set dst (ReplicateB con));
3354   format %{ "movq    $dst,[$constantaddress]\n\t"
3355             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3356   ins_encode %{
3357     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3358     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3359   %}
3360   ins_pipe( pipe_slow );
3361 %}
3362 
3363 instruct Repl32B_imm(vecY dst, immI con) %{
3364   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3365   match(Set dst (ReplicateB con));
3366   format %{ "movq    $dst,[$constantaddress]\n\t"
3367             "punpcklqdq $dst,$dst\n\t"
3368             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3369   ins_encode %{
3370     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3371     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3372     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3373   %}
3374   ins_pipe( pipe_slow );
3375 %}
3376 
3377 instruct Repl64B_imm(legVecZ dst, immI con) %{
3378   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3379   match(Set dst (ReplicateB con));
3380   format %{ "movq    $dst,[$constantaddress]\n\t"
3381             "punpcklqdq $dst,$dst\n\t"
3382             "vinserti128_high $dst,$dst\t"
3383             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3384   ins_encode %{
3385     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3386     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3387     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3388     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3389   %}
3390   ins_pipe( pipe_slow );
3391 %}
3392 
3393 instruct Repl4S(vecD dst, rRegI src) %{
3394   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3395   match(Set dst (ReplicateS src));
3396   format %{ "movd    $dst,$src\n\t"
3397             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3398   ins_encode %{
3399     __ movdl($dst$$XMMRegister, $src$$Register);
3400     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3401   %}
3402   ins_pipe( pipe_slow );
3403 %}
3404 
3405 instruct Repl4S_mem(vecD dst, memory mem) %{
3406   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3407   match(Set dst (ReplicateS (LoadS mem)));
3408   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3409   ins_encode %{
3410     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3411   %}
3412   ins_pipe( pipe_slow );
3413 %}
3414 
3415 instruct Repl8S(vecX dst, rRegI src) %{
3416   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3417   match(Set dst (ReplicateS src));
3418   format %{ "movd    $dst,$src\n\t"
3419             "pshuflw $dst,$dst,0x00\n\t"
3420             "punpcklqdq $dst,$dst\t! replicate8S" %}
3421   ins_encode %{
3422     __ movdl($dst$$XMMRegister, $src$$Register);
3423     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3424     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3425   %}
3426   ins_pipe( pipe_slow );
3427 %}
3428 
3429 instruct Repl8S_mem(vecX dst, memory mem) %{
3430   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3431   match(Set dst (ReplicateS (LoadS mem)));
3432   format %{ "pshuflw $dst,$mem,0x00\n\t"
3433             "punpcklqdq $dst,$dst\t! replicate8S" %}
3434   ins_encode %{
3435     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3436     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3437   %}
3438   ins_pipe( pipe_slow );
3439 %}
3440 
3441 instruct Repl8S_imm(vecX dst, immI con) %{
3442   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3443   match(Set dst (ReplicateS con));
3444   format %{ "movq    $dst,[$constantaddress]\n\t"
3445             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3446   ins_encode %{
3447     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3448     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3449   %}
3450   ins_pipe( pipe_slow );
3451 %}
3452 
3453 instruct Repl16S(vecY dst, rRegI src) %{
3454   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3455   match(Set dst (ReplicateS src));
3456   format %{ "movd    $dst,$src\n\t"
3457             "pshuflw $dst,$dst,0x00\n\t"
3458             "punpcklqdq $dst,$dst\n\t"
3459             "vinserti128_high $dst,$dst\t! replicate16S" %}
3460   ins_encode %{
3461     __ movdl($dst$$XMMRegister, $src$$Register);
3462     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3463     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3464     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3465   %}
3466   ins_pipe( pipe_slow );
3467 %}
3468 
3469 instruct Repl16S_mem(vecY dst, memory mem) %{
3470   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3471   match(Set dst (ReplicateS (LoadS mem)));
3472   format %{ "pshuflw $dst,$mem,0x00\n\t"
3473             "punpcklqdq $dst,$dst\n\t"
3474             "vinserti128_high $dst,$dst\t! replicate16S" %}
3475   ins_encode %{
3476     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3477     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3478     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3479   %}
3480   ins_pipe( pipe_slow );
3481 %}
3482 
3483 instruct Repl16S_imm(vecY dst, immI con) %{
3484   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3485   match(Set dst (ReplicateS con));
3486   format %{ "movq    $dst,[$constantaddress]\n\t"
3487             "punpcklqdq $dst,$dst\n\t"
3488             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3489   ins_encode %{
3490     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3491     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3492     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3493   %}
3494   ins_pipe( pipe_slow );
3495 %}
3496 
3497 instruct Repl32S(legVecZ dst, rRegI src) %{
3498   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3499   match(Set dst (ReplicateS src));
3500   format %{ "movd    $dst,$src\n\t"
3501             "pshuflw $dst,$dst,0x00\n\t"
3502             "punpcklqdq $dst,$dst\n\t"
3503             "vinserti128_high $dst,$dst\t"
3504             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3505   ins_encode %{
3506     __ movdl($dst$$XMMRegister, $src$$Register);
3507     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3508     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3509     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3510     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3511   %}
3512   ins_pipe( pipe_slow );
3513 %}
3514 
3515 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3516   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3517   match(Set dst (ReplicateS (LoadS mem)));
3518   format %{ "pshuflw $dst,$mem,0x00\n\t"
3519             "punpcklqdq $dst,$dst\n\t"
3520             "vinserti128_high $dst,$dst\t"
3521             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3522   ins_encode %{
3523     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3524     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3525     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3526     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3527   %}
3528   ins_pipe( pipe_slow );
3529 %}
3530 
3531 instruct Repl32S_imm(legVecZ dst, immI con) %{
3532   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3533   match(Set dst (ReplicateS con));
3534   format %{ "movq    $dst,[$constantaddress]\n\t"
3535             "punpcklqdq $dst,$dst\n\t"
3536             "vinserti128_high $dst,$dst\t"
3537             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3538   ins_encode %{
3539     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3540     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3541     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3542     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3543   %}
3544   ins_pipe( pipe_slow );
3545 %}
3546 
3547 instruct Repl4I(vecX dst, rRegI src) %{
3548   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3549   match(Set dst (ReplicateI src));
3550   format %{ "movd    $dst,$src\n\t"
3551             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3552   ins_encode %{
3553     __ movdl($dst$$XMMRegister, $src$$Register);
3554     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3555   %}
3556   ins_pipe( pipe_slow );
3557 %}
3558 
3559 instruct Repl4I_mem(vecX dst, memory mem) %{
3560   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3561   match(Set dst (ReplicateI (LoadI mem)));
3562   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3563   ins_encode %{
3564     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3565   %}
3566   ins_pipe( pipe_slow );
3567 %}
3568 
3569 instruct Repl8I(vecY dst, rRegI src) %{
3570   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3571   match(Set dst (ReplicateI src));
3572   format %{ "movd    $dst,$src\n\t"
3573             "pshufd  $dst,$dst,0x00\n\t"
3574             "vinserti128_high $dst,$dst\t! replicate8I" %}
3575   ins_encode %{
3576     __ movdl($dst$$XMMRegister, $src$$Register);
3577     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3578     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3579   %}
3580   ins_pipe( pipe_slow );
3581 %}
3582 
3583 instruct Repl8I_mem(vecY dst, memory mem) %{
3584   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3585   match(Set dst (ReplicateI (LoadI mem)));
3586   format %{ "pshufd  $dst,$mem,0x00\n\t"
3587             "vinserti128_high $dst,$dst\t! replicate8I" %}
3588   ins_encode %{
3589     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3590     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3591   %}
3592   ins_pipe( pipe_slow );
3593 %}
3594 
3595 instruct Repl16I(legVecZ dst, rRegI src) %{
3596   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3597   match(Set dst (ReplicateI src));
3598   format %{ "movd    $dst,$src\n\t"
3599             "pshufd  $dst,$dst,0x00\n\t"
3600             "vinserti128_high $dst,$dst\t"
3601             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3602   ins_encode %{
3603     __ movdl($dst$$XMMRegister, $src$$Register);
3604     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3605     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3606     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3607   %}
3608   ins_pipe( pipe_slow );
3609 %}
3610 
3611 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3612   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3613   match(Set dst (ReplicateI (LoadI mem)));
3614   format %{ "pshufd  $dst,$mem,0x00\n\t"
3615             "vinserti128_high $dst,$dst\t"
3616             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3617   ins_encode %{
3618     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3619     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3620     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3621   %}
3622   ins_pipe( pipe_slow );
3623 %}
3624 
3625 instruct Repl4I_imm(vecX dst, immI con) %{
3626   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3627   match(Set dst (ReplicateI con));
3628   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3629             "punpcklqdq $dst,$dst" %}
3630   ins_encode %{
3631     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3632     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3633   %}
3634   ins_pipe( pipe_slow );
3635 %}
3636 
3637 instruct Repl8I_imm(vecY dst, immI con) %{
3638   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3639   match(Set dst (ReplicateI con));
3640   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3641             "punpcklqdq $dst,$dst\n\t"
3642             "vinserti128_high $dst,$dst" %}
3643   ins_encode %{
3644     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3645     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3646     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3647   %}
3648   ins_pipe( pipe_slow );
3649 %}
3650 
3651 instruct Repl16I_imm(legVecZ dst, immI con) %{
3652   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3653   match(Set dst (ReplicateI con));
3654   format %{ "movq    $dst,[$constantaddress]\n\t"
3655             "punpcklqdq $dst,$dst\n\t"
3656             "vinserti128_high $dst,$dst\t"
3657             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3658   ins_encode %{
3659     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3660     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3661     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3662     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3663   %}
3664   ins_pipe( pipe_slow );
3665 %}
3666 
3667 // A long can be loaded into an xmm register directly from memory.
3668 instruct Repl2L_mem(vecX dst, memory mem) %{
3669   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3670   match(Set dst (ReplicateL (LoadL mem)));
3671   format %{ "movq    $dst,$mem\n\t"
3672             "punpcklqdq $dst,$dst\t! replicate2L" %}
3673   ins_encode %{
3674     __ movq($dst$$XMMRegister, $mem$$Address);
3675     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3676   %}
3677   ins_pipe( pipe_slow );
3678 %}
3679 
3680 // Replicate long (8 byte) scalar to be vector
3681 #ifdef _LP64
3682 instruct Repl4L(vecY dst, rRegL src) %{
3683   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3684   match(Set dst (ReplicateL src));
3685   format %{ "movdq   $dst,$src\n\t"
3686             "punpcklqdq $dst,$dst\n\t"
3687             "vinserti128_high $dst,$dst\t! replicate4L" %}
3688   ins_encode %{
3689     __ movdq($dst$$XMMRegister, $src$$Register);
3690     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3691     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3692   %}
3693   ins_pipe( pipe_slow );
3694 %}
3695 
3696 instruct Repl8L(legVecZ dst, rRegL src) %{
3697   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3698   match(Set dst (ReplicateL src));
3699   format %{ "movdq   $dst,$src\n\t"
3700             "punpcklqdq $dst,$dst\n\t"
3701             "vinserti128_high $dst,$dst\t"
3702             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3703   ins_encode %{
3704     __ movdq($dst$$XMMRegister, $src$$Register);
3705     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3706     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3707     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3708   %}
3709   ins_pipe( pipe_slow );
3710 %}
3711 #else // _LP64
3712 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3713   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3714   match(Set dst (ReplicateL src));
3715   effect(TEMP dst, USE src, TEMP tmp);
3716   format %{ "movdl   $dst,$src.lo\n\t"
3717             "movdl   $tmp,$src.hi\n\t"
3718             "punpckldq $dst,$tmp\n\t"
3719             "punpcklqdq $dst,$dst\n\t"
3720             "vinserti128_high $dst,$dst\t! replicate4L" %}
3721   ins_encode %{
3722     __ movdl($dst$$XMMRegister, $src$$Register);
3723     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3724     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3725     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3726     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3727   %}
3728   ins_pipe( pipe_slow );
3729 %}
3730 
3731 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3732   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3733   match(Set dst (ReplicateL src));
3734   effect(TEMP dst, USE src, TEMP tmp);
3735   format %{ "movdl   $dst,$src.lo\n\t"
3736             "movdl   $tmp,$src.hi\n\t"
3737             "punpckldq $dst,$tmp\n\t"
3738             "punpcklqdq $dst,$dst\n\t"
3739             "vinserti128_high $dst,$dst\t"
3740             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3741   ins_encode %{
3742     __ movdl($dst$$XMMRegister, $src$$Register);
3743     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3744     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3745     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3746     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3747     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3748   %}
3749   ins_pipe( pipe_slow );
3750 %}
3751 #endif // _LP64
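
// On 32-bit VMs (!_LP64) a long lives in a register pair, so the two 32-bit
// halves are moved into XMM separately and recombined with punpckldq before
// the usual qword broadcast.  A rough scalar model of those first three steps
// (illustrative only, not part of the build):
//
//   static inline unsigned long long assemble_long(unsigned int lo, unsigned int hi) {
//     unsigned long long dst = lo;              // movdl   dst,src.lo
//     unsigned long long tmp = hi;              // movdl   tmp,src.hi
//     return dst | (tmp << 32);                 // punpckldq dst,tmp
//   }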
3752 
3753 instruct Repl4L_imm(vecY dst, immL con) %{
3754   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3755   match(Set dst (ReplicateL con));
3756   format %{ "movq    $dst,[$constantaddress]\n\t"
3757             "punpcklqdq $dst,$dst\n\t"
3758             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3759   ins_encode %{
3760     __ movq($dst$$XMMRegister, $constantaddress($con));
3761     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3762     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3763   %}
3764   ins_pipe( pipe_slow );
3765 %}
3766 
3767 instruct Repl8L_imm(legVecZ dst, immL con) %{
3768   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3769   match(Set dst (ReplicateL con));
3770   format %{ "movq    $dst,[$constantaddress]\n\t"
3771             "punpcklqdq $dst,$dst\n\t"
3772             "vinserti128_high $dst,$dst\t"
3773             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3774   ins_encode %{
3775     __ movq($dst$$XMMRegister, $constantaddress($con));
3776     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3777     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3778     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3779   %}
3780   ins_pipe( pipe_slow );
3781 %}
3782 
3783 instruct Repl4L_mem(vecY dst, memory mem) %{
3784   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3785   match(Set dst (ReplicateL (LoadL mem)));
3786   format %{ "movq    $dst,$mem\n\t"
3787             "punpcklqdq $dst,$dst\n\t"
3788             "vinserti128_high $dst,$dst\t! replicate4L" %}
3789   ins_encode %{
3790     __ movq($dst$$XMMRegister, $mem$$Address);
3791     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3792     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3793   %}
3794   ins_pipe( pipe_slow );
3795 %}
3796 
3797 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3798   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3799   match(Set dst (ReplicateL (LoadL mem)));
3800   format %{ "movq    $dst,$mem\n\t"
3801             "punpcklqdq $dst,$dst\n\t"
3802             "vinserti128_high $dst,$dst\t"
3803             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3804   ins_encode %{
3805     __ movq($dst$$XMMRegister, $mem$$Address);
3806     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3807     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3808     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3809   %}
3810   ins_pipe( pipe_slow );
3811 %}
3812 
3813 instruct Repl2F_mem(vecD dst, memory mem) %{
3814   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3815   match(Set dst (ReplicateF (LoadF mem)));
3816   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3817   ins_encode %{
3818     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3819   %}
3820   ins_pipe( pipe_slow );
3821 %}
3822 
3823 instruct Repl4F_mem(vecX dst, memory mem) %{
3824   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3825   match(Set dst (ReplicateF (LoadF mem)));
3826   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3827   ins_encode %{
3828     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3829   %}
3830   ins_pipe( pipe_slow );
3831 %}
3832 
3833 instruct Repl8F(vecY dst, vlRegF src) %{
3834   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3835   match(Set dst (ReplicateF src));
3836   format %{ "pshufd  $dst,$src,0x00\n\t"
3837             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3838   ins_encode %{
3839     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3840     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3841   %}
3842   ins_pipe( pipe_slow );
3843 %}
3844 
3845 instruct Repl8F_mem(vecY dst, memory mem) %{
3846   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3847   match(Set dst (ReplicateF (LoadF mem)));
3848   format %{ "pshufd  $dst,$mem,0x00\n\t"
3849             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3850   ins_encode %{
3851     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3852     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3853   %}
3854   ins_pipe( pipe_slow );
3855 %}
3856 
3857 instruct Repl16F(legVecZ dst, vlRegF src) %{
3858   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3859   match(Set dst (ReplicateF src));
3860   format %{ "pshufd  $dst,$src,0x00\n\t"
3861             "vinsertf128_high $dst,$dst\t"
3862             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3863   ins_encode %{
3864     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3865     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3866     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3867   %}
3868   ins_pipe( pipe_slow );
3869 %}
3870 
3871 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3872   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3873   match(Set dst (ReplicateF (LoadF mem)));
3874   format %{ "pshufd  $dst,$mem,0x00\n\t"
3875             "vinsertf128_high $dst,$dst\t"
3876             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3877   ins_encode %{
3878     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3879     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3880     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3881   %}
3882   ins_pipe( pipe_slow );
3883 %}
3884 
3885 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3886   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3887   match(Set dst (ReplicateF zero));
3888   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3889   ins_encode %{
3890     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3891   %}
3892   ins_pipe( fpu_reg_reg );
3893 %}
3894 
3895 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3896   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3897   match(Set dst (ReplicateF zero));
3898   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3899   ins_encode %{
3900     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3901   %}
3902   ins_pipe( fpu_reg_reg );
3903 %}
3904 
3905 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3906   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3907   match(Set dst (ReplicateF zero));
3908   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3909   ins_encode %{
3910     int vector_len = 1;
3911     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3912   %}
3913   ins_pipe( fpu_reg_reg );
3914 %}
3915 
3916 instruct Repl2D_mem(vecX dst, memory mem) %{
3917   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3918   match(Set dst (ReplicateD (LoadD mem)));
3919   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3920   ins_encode %{
3921     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3922   %}
3923   ins_pipe( pipe_slow );
3924 %}
3925 
3926 instruct Repl4D(vecY dst, vlRegD src) %{
3927   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3928   match(Set dst (ReplicateD src));
3929   format %{ "pshufd  $dst,$src,0x44\n\t"
3930             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3931   ins_encode %{
3932     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3933     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3934   %}
3935   ins_pipe( pipe_slow );
3936 %}
3937 
3938 instruct Repl4D_mem(vecY dst, memory mem) %{
3939   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3940   match(Set dst (ReplicateD (LoadD mem)));
3941   format %{ "pshufd  $dst,$mem,0x44\n\t"
3942             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3943   ins_encode %{
3944     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3945     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3946   %}
3947   ins_pipe( pipe_slow );
3948 %}
3949 
3950 instruct Repl8D(legVecZ dst, vlRegD src) %{
3951   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3952   match(Set dst (ReplicateD src));
3953   format %{ "pshufd  $dst,$src,0x44\n\t"
3954             "vinsertf128_high $dst,$dst\t"
3955             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3956   ins_encode %{
3957     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3958     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3959     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3960   %}
3961   ins_pipe( pipe_slow );
3962 %}
3963 
3964 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3965   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3966   match(Set dst (ReplicateD (LoadD mem)));
3967   format %{ "pshufd  $dst,$mem,0x44\n\t"
3968             "vinsertf128_high $dst,$dst\t"
3969             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3970   ins_encode %{
3971     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3972     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3973     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3974   %}
3975   ins_pipe( pipe_slow );
3976 %}
3977 
3978 // Replicate double (8 byte) scalar zero to be vector
3979 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3980   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3981   match(Set dst (ReplicateD zero));
3982   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3983   ins_encode %{
3984     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3985   %}
3986   ins_pipe( fpu_reg_reg );
3987 %}
3988 
3989 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3990   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3991   match(Set dst (ReplicateD zero));
3992   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3993   ins_encode %{
3994     int vector_len = 1;
3995     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3996   %}
3997   ins_pipe( fpu_reg_reg );
3998 %}
3999 
4000 // ====================GENERIC REPLICATE==========================================
4001 
4002 // Replicate byte scalar to be vector
4003 instruct Repl4B(vecS dst, rRegI src) %{
4004   predicate(n->as_Vector()->length() == 4);
4005   match(Set dst (ReplicateB src));
4006   format %{ "movd    $dst,$src\n\t"
4007             "punpcklbw $dst,$dst\n\t"
4008             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
4009   ins_encode %{
4010     __ movdl($dst$$XMMRegister, $src$$Register);
4011     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4012     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4013   %}
4014   ins_pipe( pipe_slow );
4015 %}
4016 
4017 instruct Repl8B(vecD dst, rRegI src) %{
4018   predicate(n->as_Vector()->length() == 8);
4019   match(Set dst (ReplicateB src));
4020   format %{ "movd    $dst,$src\n\t"
4021             "punpcklbw $dst,$dst\n\t"
4022             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
4023   ins_encode %{
4024     __ movdl($dst$$XMMRegister, $src$$Register);
4025     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4026     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4027   %}
4028   ins_pipe( pipe_slow );
4029 %}
4030 
4031 // Replicate byte scalar immediate to be vector by loading from const table.
4032 instruct Repl4B_imm(vecS dst, immI con) %{
4033   predicate(n->as_Vector()->length() == 4);
4034   match(Set dst (ReplicateB con));
4035   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
4036   ins_encode %{
4037     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
4038   %}
4039   ins_pipe( pipe_slow );
4040 %}
4041 
4042 instruct Repl8B_imm(vecD dst, immI con) %{
4043   predicate(n->as_Vector()->length() == 8);
4044   match(Set dst (ReplicateB con));
4045   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
4046   ins_encode %{
4047     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4048   %}
4049   ins_pipe( pipe_slow );
4050 %}
4051 
4052 // Replicate byte scalar zero to be vector
4053 instruct Repl4B_zero(vecS dst, immI0 zero) %{
4054   predicate(n->as_Vector()->length() == 4);
4055   match(Set dst (ReplicateB zero));
4056   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
4057   ins_encode %{
4058     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4059   %}
4060   ins_pipe( fpu_reg_reg );
4061 %}
4062 
4063 instruct Repl8B_zero(vecD dst, immI0 zero) %{
4064   predicate(n->as_Vector()->length() == 8);
4065   match(Set dst (ReplicateB zero));
4066   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
4067   ins_encode %{
4068     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4069   %}
4070   ins_pipe( fpu_reg_reg );
4071 %}
4072 
4073 instruct Repl16B_zero(vecX dst, immI0 zero) %{
4074   predicate(n->as_Vector()->length() == 16);
4075   match(Set dst (ReplicateB zero));
4076   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
4077   ins_encode %{
4078     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4079   %}
4080   ins_pipe( fpu_reg_reg );
4081 %}
4082 
4083 instruct Repl32B_zero(vecY dst, immI0 zero) %{
4084   predicate(n->as_Vector()->length() == 32);
4085   match(Set dst (ReplicateB zero));
4086   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
4087   ins_encode %{
    // 256-bit vpxor requires AVX2; MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available.
4089     int vector_len = 1;
4090     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4091   %}
4092   ins_pipe( fpu_reg_reg );
4093 %}
4094 
4095 // Replicate char/short (2 byte) scalar to be vector
4096 instruct Repl2S(vecS dst, rRegI src) %{
4097   predicate(n->as_Vector()->length() == 2);
4098   match(Set dst (ReplicateS src));
4099   format %{ "movd    $dst,$src\n\t"
4100             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
4101   ins_encode %{
4102     __ movdl($dst$$XMMRegister, $src$$Register);
4103     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4104   %}
4105   ins_pipe( fpu_reg_reg );
4106 %}
4107 
4108 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
4109 instruct Repl2S_imm(vecS dst, immI con) %{
4110   predicate(n->as_Vector()->length() == 2);
4111   match(Set dst (ReplicateS con));
4112   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
4113   ins_encode %{
4114     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4115   %}
4116   ins_pipe( fpu_reg_reg );
4117 %}
4118 
4119 instruct Repl4S_imm(vecD dst, immI con) %{
4120   predicate(n->as_Vector()->length() == 4);
4121   match(Set dst (ReplicateS con));
4122   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4123   ins_encode %{
4124     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4125   %}
4126   ins_pipe( fpu_reg_reg );
4127 %}
4128 
4129 // Replicate char/short (2 byte) scalar zero to be vector
4130 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4131   predicate(n->as_Vector()->length() == 2);
4132   match(Set dst (ReplicateS zero));
4133   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4134   ins_encode %{
4135     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4136   %}
4137   ins_pipe( fpu_reg_reg );
4138 %}
4139 
4140 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4141   predicate(n->as_Vector()->length() == 4);
4142   match(Set dst (ReplicateS zero));
4143   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4144   ins_encode %{
4145     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4146   %}
4147   ins_pipe( fpu_reg_reg );
4148 %}
4149 
4150 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4151   predicate(n->as_Vector()->length() == 8);
4152   match(Set dst (ReplicateS zero));
4153   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4154   ins_encode %{
4155     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4156   %}
4157   ins_pipe( fpu_reg_reg );
4158 %}
4159 
4160 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4161   predicate(n->as_Vector()->length() == 16);
4162   match(Set dst (ReplicateS zero));
4163   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4164   ins_encode %{
    // 256-bit vpxor requires AVX2; MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available.
4166     int vector_len = 1;
4167     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4168   %}
4169   ins_pipe( fpu_reg_reg );
4170 %}
4171 
4172 // Replicate integer (4 byte) scalar to be vector
4173 instruct Repl2I(vecD dst, rRegI src) %{
4174   predicate(n->as_Vector()->length() == 2);
4175   match(Set dst (ReplicateI src));
4176   format %{ "movd    $dst,$src\n\t"
4177             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4178   ins_encode %{
4179     __ movdl($dst$$XMMRegister, $src$$Register);
4180     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4181   %}
4182   ins_pipe( fpu_reg_reg );
4183 %}
4184 
4185 // Integer could be loaded into xmm register directly from memory.
4186 instruct Repl2I_mem(vecD dst, memory mem) %{
4187   predicate(n->as_Vector()->length() == 2);
4188   match(Set dst (ReplicateI (LoadI mem)));
4189   format %{ "movd    $dst,$mem\n\t"
4190             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4191   ins_encode %{
4192     __ movdl($dst$$XMMRegister, $mem$$Address);
4193     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4194   %}
4195   ins_pipe( fpu_reg_reg );
4196 %}
4197 
4198 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4199 instruct Repl2I_imm(vecD dst, immI con) %{
4200   predicate(n->as_Vector()->length() == 2);
4201   match(Set dst (ReplicateI con));
4202   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4203   ins_encode %{
4204     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4205   %}
4206   ins_pipe( fpu_reg_reg );
4207 %}
4208 
4209 // Replicate integer (4 byte) scalar zero to be vector
4210 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4211   predicate(n->as_Vector()->length() == 2);
4212   match(Set dst (ReplicateI zero));
4213   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4214   ins_encode %{
4215     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4216   %}
4217   ins_pipe( fpu_reg_reg );
4218 %}
4219 
4220 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4221   predicate(n->as_Vector()->length() == 4);
4222   match(Set dst (ReplicateI zero));
4223   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4224   ins_encode %{
4225     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4226   %}
4227   ins_pipe( fpu_reg_reg );
4228 %}
4229 
4230 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4231   predicate(n->as_Vector()->length() == 8);
4232   match(Set dst (ReplicateI zero));
4233   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4234   ins_encode %{
    // 256-bit vpxor requires AVX2; MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available.
4236     int vector_len = 1;
4237     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4238   %}
4239   ins_pipe( fpu_reg_reg );
4240 %}
4241 
4242 // Replicate long (8 byte) scalar to be vector
4243 #ifdef _LP64
4244 instruct Repl2L(vecX dst, rRegL src) %{
4245   predicate(n->as_Vector()->length() == 2);
4246   match(Set dst (ReplicateL src));
4247   format %{ "movdq   $dst,$src\n\t"
4248             "punpcklqdq $dst,$dst\t! replicate2L" %}
4249   ins_encode %{
4250     __ movdq($dst$$XMMRegister, $src$$Register);
4251     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4252   %}
4253   ins_pipe( pipe_slow );
4254 %}
4255 #else // _LP64
4256 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4257   predicate(n->as_Vector()->length() == 2);
4258   match(Set dst (ReplicateL src));
4259   effect(TEMP dst, USE src, TEMP tmp);
4260   format %{ "movdl   $dst,$src.lo\n\t"
4261             "movdl   $tmp,$src.hi\n\t"
4262             "punpckldq $dst,$tmp\n\t"
4263             "punpcklqdq $dst,$dst\t! replicate2L"%}
4264   ins_encode %{
4265     __ movdl($dst$$XMMRegister, $src$$Register);
4266     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4267     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4268     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4269   %}
4270   ins_pipe( pipe_slow );
4271 %}
4272 #endif // _LP64
4273 
4274 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4275 instruct Repl2L_imm(vecX dst, immL con) %{
4276   predicate(n->as_Vector()->length() == 2);
4277   match(Set dst (ReplicateL con));
4278   format %{ "movq    $dst,[$constantaddress]\n\t"
4279             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4280   ins_encode %{
4281     __ movq($dst$$XMMRegister, $constantaddress($con));
4282     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4283   %}
4284   ins_pipe( pipe_slow );
4285 %}
4286 
4287 // Replicate long (8 byte) scalar zero to be vector
4288 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4289   predicate(n->as_Vector()->length() == 2);
4290   match(Set dst (ReplicateL zero));
4291   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4292   ins_encode %{
4293     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4294   %}
4295   ins_pipe( fpu_reg_reg );
4296 %}
4297 
4298 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4299   predicate(n->as_Vector()->length() == 4);
4300   match(Set dst (ReplicateL zero));
4301   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4302   ins_encode %{
    // 256-bit vpxor requires AVX2; MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available.
4304     int vector_len = 1;
4305     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4306   %}
4307   ins_pipe( fpu_reg_reg );
4308 %}
4309 
4310 // Replicate float (4 byte) scalar to be vector
4311 instruct Repl2F(vecD dst, vlRegF src) %{
4312   predicate(n->as_Vector()->length() == 2);
4313   match(Set dst (ReplicateF src));
4314   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4315   ins_encode %{
4316     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4317   %}
4318   ins_pipe( fpu_reg_reg );
4319 %}
4320 
4321 instruct Repl4F(vecX dst, vlRegF src) %{
4322   predicate(n->as_Vector()->length() == 4);
4323   match(Set dst (ReplicateF src));
4324   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4325   ins_encode %{
4326     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4327   %}
4328   ins_pipe( pipe_slow );
4329 %}
4330 
4331 // Replicate double (8 bytes) scalar to be vector
4332 instruct Repl2D(vecX dst, vlRegD src) %{
4333   predicate(n->as_Vector()->length() == 2);
4334   match(Set dst (ReplicateD src));
4335   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4336   ins_encode %{
4337     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4338   %}
4339   ins_pipe( pipe_slow );
4340 %}
4341 
4342 // ====================EVEX REPLICATE=============================================
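
// The vector_len argument handed to the broadcast and xor helpers below selects
// the operand width: 0 = 128-bit (XMM), 1 = 256-bit (YMM), 2 = 512-bit (ZMM),
// matching the Assembler::AVX_128bit/AVX_256bit/AVX_512bit encoding.
// Equivalently (illustrative only, not part of the build):
//
//   static inline int vector_len_in_bytes(int vector_len) {
//     return 16 << vector_len;   // 0 -> 16, 1 -> 32, 2 -> 64 bytes
//   }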
4343 
4344 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4345   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4346   match(Set dst (ReplicateB (LoadB mem)));
4347   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4348   ins_encode %{
4349     int vector_len = 0;
4350     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4351   %}
4352   ins_pipe( pipe_slow );
4353 %}
4354 
4355 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4356   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4357   match(Set dst (ReplicateB (LoadB mem)));
4358   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4359   ins_encode %{
4360     int vector_len = 0;
4361     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4362   %}
4363   ins_pipe( pipe_slow );
4364 %}
4365 
4366 instruct Repl16B_evex(vecX dst, rRegI src) %{
4367   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4368   match(Set dst (ReplicateB src));
4369   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4370   ins_encode %{
    int vector_len = 0;
4372     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4373   %}
4374   ins_pipe( pipe_slow );
4375 %}
4376 
4377 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4378   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4379   match(Set dst (ReplicateB (LoadB mem)));
4380   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4381   ins_encode %{
4382     int vector_len = 0;
4383     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4384   %}
4385   ins_pipe( pipe_slow );
4386 %}
4387 
4388 instruct Repl32B_evex(vecY dst, rRegI src) %{
4389   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4390   match(Set dst (ReplicateB src));
4391   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4392   ins_encode %{
    int vector_len = 1;
4394     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4395   %}
4396   ins_pipe( pipe_slow );
4397 %}
4398 
4399 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4400   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4401   match(Set dst (ReplicateB (LoadB mem)));
4402   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4403   ins_encode %{
4404     int vector_len = 1;
4405     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4406   %}
4407   ins_pipe( pipe_slow );
4408 %}
4409 
4410 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4411   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4412   match(Set dst (ReplicateB src));
4413   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4414   ins_encode %{
    int vector_len = 2;
4416     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4417   %}
4418   ins_pipe( pipe_slow );
4419 %}
4420 
4421 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4422   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4423   match(Set dst (ReplicateB (LoadB mem)));
4424   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4425   ins_encode %{
4426     int vector_len = 2;
4427     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4428   %}
4429   ins_pipe( pipe_slow );
4430 %}
4431 
4432 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4433   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4434   match(Set dst (ReplicateB con));
4435   format %{ "movq    $dst,[$constantaddress]\n\t"
4436             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4437   ins_encode %{
    int vector_len = 0;
4439     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4440     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4441   %}
4442   ins_pipe( pipe_slow );
4443 %}
4444 
4445 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4446   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4447   match(Set dst (ReplicateB con));
4448   format %{ "movq    $dst,[$constantaddress]\n\t"
4449             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4450   ins_encode %{
    int vector_len = 1;
4452     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4453     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4454   %}
4455   ins_pipe( pipe_slow );
4456 %}
4457 
4458 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4459   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4460   match(Set dst (ReplicateB con));
4461   format %{ "movq    $dst,[$constantaddress]\n\t"
4462             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4463   ins_encode %{
    int vector_len = 2;
4465     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4466     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4467   %}
4468   ins_pipe( pipe_slow );
4469 %}
4470 
4471 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4472   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4473   match(Set dst (ReplicateB zero));
4474   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4475   ins_encode %{
    // 512-bit vpxor is EVEX-encoded; it is available here because the predicate requires UseAVX > 2.
4477     int vector_len = 2;
4478     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4479   %}
4480   ins_pipe( fpu_reg_reg );
4481 %}
4482 
4483 instruct Repl4S_evex(vecD dst, rRegI src) %{
4484   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4485   match(Set dst (ReplicateS src));
4486   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4487   ins_encode %{
    int vector_len = 0;
4489     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4490   %}
4491   ins_pipe( pipe_slow );
4492 %}
4493 
4494 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4495   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4496   match(Set dst (ReplicateS (LoadS mem)));
4497   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4498   ins_encode %{
4499     int vector_len = 0;
4500     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4501   %}
4502   ins_pipe( pipe_slow );
4503 %}
4504 
4505 instruct Repl8S_evex(vecX dst, rRegI src) %{
4506   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4507   match(Set dst (ReplicateS src));
4508   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4509   ins_encode %{
    int vector_len = 0;
4511     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4512   %}
4513   ins_pipe( pipe_slow );
4514 %}
4515 
4516 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4517   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4518   match(Set dst (ReplicateS (LoadS mem)));
4519   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4520   ins_encode %{
4521     int vector_len = 0;
4522     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4523   %}
4524   ins_pipe( pipe_slow );
4525 %}
4526 
4527 instruct Repl16S_evex(vecY dst, rRegI src) %{
4528   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4529   match(Set dst (ReplicateS src));
4530   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4531   ins_encode %{
    int vector_len = 1;
4533     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4534   %}
4535   ins_pipe( pipe_slow );
4536 %}
4537 
4538 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4539   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4540   match(Set dst (ReplicateS (LoadS mem)));
4541   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4542   ins_encode %{
4543     int vector_len = 1;
4544     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4545   %}
4546   ins_pipe( pipe_slow );
4547 %}
4548 
4549 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4550   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4551   match(Set dst (ReplicateS src));
4552   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4553   ins_encode %{
    int vector_len = 2;
4555     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4556   %}
4557   ins_pipe( pipe_slow );
4558 %}
4559 
4560 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4561   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4562   match(Set dst (ReplicateS (LoadS mem)));
4563   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4564   ins_encode %{
4565     int vector_len = 2;
4566     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4567   %}
4568   ins_pipe( pipe_slow );
4569 %}
4570 
4571 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4572   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4573   match(Set dst (ReplicateS con));
4574   format %{ "movq    $dst,[$constantaddress]\n\t"
4575             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4576   ins_encode %{
    int vector_len = 0;
4578     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4579     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4580   %}
4581   ins_pipe( pipe_slow );
4582 %}
4583 
4584 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4585   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4586   match(Set dst (ReplicateS con));
4587   format %{ "movq    $dst,[$constantaddress]\n\t"
4588             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4589   ins_encode %{
    int vector_len = 1;
4591     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4592     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4593   %}
4594   ins_pipe( pipe_slow );
4595 %}
4596 
4597 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4598   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4599   match(Set dst (ReplicateS con));
4600   format %{ "movq    $dst,[$constantaddress]\n\t"
4601             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4602   ins_encode %{
    int vector_len = 2;
4604     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4605     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4606   %}
4607   ins_pipe( pipe_slow );
4608 %}
4609 
4610 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4611   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4612   match(Set dst (ReplicateS zero));
4613   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4614   ins_encode %{
4615     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4616     int vector_len = 2;
4617     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4618   %}
4619   ins_pipe( fpu_reg_reg );
4620 %}
4621 
4622 instruct Repl4I_evex(vecX dst, rRegI src) %{
4623   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4624   match(Set dst (ReplicateI src));
4625   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4626   ins_encode %{
4627     int vector_len = 0;
4628     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4629   %}
4630   ins_pipe( pipe_slow );
4631 %}
4632 
4633 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4634   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4635   match(Set dst (ReplicateI (LoadI mem)));
4636   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4637   ins_encode %{
4638     int vector_len = 0;
4639     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4640   %}
4641   ins_pipe( pipe_slow );
4642 %}
4643 
4644 instruct Repl8I_evex(vecY dst, rRegI src) %{
4645   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4646   match(Set dst (ReplicateI src));
4647   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4648   ins_encode %{
4649     int vector_len = 1;
4650     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4651   %}
4652   ins_pipe( pipe_slow );
4653 %}
4654 
4655 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4656   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4657   match(Set dst (ReplicateI (LoadI mem)));
4658   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4659   ins_encode %{
4660     int vector_len = 1;
4661     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4662   %}
4663   ins_pipe( pipe_slow );
4664 %}
4665 
4666 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4667   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4668   match(Set dst (ReplicateI src));
4669   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4670   ins_encode %{
4671     int vector_len = 2;
4672     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4673   %}
4674   ins_pipe( pipe_slow );
4675 %}
4676 
4677 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4678   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4679   match(Set dst (ReplicateI (LoadI mem)));
4680   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4681   ins_encode %{
4682     int vector_len = 2;
4683     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4684   %}
4685   ins_pipe( pipe_slow );
4686 %}
4687 
4688 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4689   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4690   match(Set dst (ReplicateI con));
4691   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4692             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4693   ins_encode %{
4694     int vector_len = 0;
4695     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4696     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4697   %}
4698   ins_pipe( pipe_slow );
4699 %}
4700 
4701 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4702   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4703   match(Set dst (ReplicateI con));
4704   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4705             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4706   ins_encode %{
4707     int vector_len = 1;
4708     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4709     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4710   %}
4711   ins_pipe( pipe_slow );
4712 %}
4713 
4714 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4715   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4716   match(Set dst (ReplicateI con));
4717   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4718             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4719   ins_encode %{
4720     int vector_len = 2;
4721     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4722     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4723   %}
4724   ins_pipe( pipe_slow );
4725 %}
4726 
4727 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4728   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4729   match(Set dst (ReplicateI zero));
4730   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4731   ins_encode %{
    // 512-bit vpxor is EVEX-encoded; it is available here because the predicate requires UseAVX > 2.
4733     int vector_len = 2;
4734     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4735   %}
4736   ins_pipe( fpu_reg_reg );
4737 %}
4738 
4739 // Replicate long (8 byte) scalar to be vector
4740 #ifdef _LP64
4741 instruct Repl4L_evex(vecY dst, rRegL src) %{
4742   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4743   match(Set dst (ReplicateL src));
4744   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4745   ins_encode %{
4746     int vector_len = 1;
4747     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4748   %}
4749   ins_pipe( pipe_slow );
4750 %}
4751 
4752 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4753   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4754   match(Set dst (ReplicateL src));
4755   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4756   ins_encode %{
4757     int vector_len = 2;
4758     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4759   %}
4760   ins_pipe( pipe_slow );
4761 %}
4762 #else // _LP64
4763 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4764   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4765   match(Set dst (ReplicateL src));
4766   effect(TEMP dst, USE src, TEMP tmp);
4767   format %{ "movdl   $dst,$src.lo\n\t"
4768             "movdl   $tmp,$src.hi\n\t"
4769             "punpckldq $dst,$tmp\n\t"
4770             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4771   ins_encode %{
4772     int vector_len = 1;
4773     __ movdl($dst$$XMMRegister, $src$$Register);
4774     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4775     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4776     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4777   %}
4778   ins_pipe( pipe_slow );
4779 %}
4780 
4781 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4782   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4783   match(Set dst (ReplicateL src));
4784   effect(TEMP dst, USE src, TEMP tmp);
4785   format %{ "movdl   $dst,$src.lo\n\t"
4786             "movdl   $tmp,$src.hi\n\t"
4787             "punpckldq $dst,$tmp\n\t"
4788             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4789   ins_encode %{
4790     int vector_len = 2;
4791     __ movdl($dst$$XMMRegister, $src$$Register);
4792     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4793     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4794     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4795   %}
4796   ins_pipe( pipe_slow );
4797 %}
4798 #endif // _LP64
4799 
4800 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4801   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4802   match(Set dst (ReplicateL con));
4803   format %{ "movq    $dst,[$constantaddress]\n\t"
4804             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4805   ins_encode %{
4806     int vector_len = 1;
4807     __ movq($dst$$XMMRegister, $constantaddress($con));
4808     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4809   %}
4810   ins_pipe( pipe_slow );
4811 %}
4812 
4813 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4814   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4815   match(Set dst (ReplicateL con));
4816   format %{ "movq    $dst,[$constantaddress]\n\t"
4817             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4818   ins_encode %{
4819     int vector_len = 2;
4820     __ movq($dst$$XMMRegister, $constantaddress($con));
4821     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4822   %}
4823   ins_pipe( pipe_slow );
4824 %}
4825 
4826 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4827   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4828   match(Set dst (ReplicateL (LoadL mem)));
4829   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4830   ins_encode %{
4831     int vector_len = 0;
4832     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4833   %}
4834   ins_pipe( pipe_slow );
4835 %}
4836 
4837 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4838   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4839   match(Set dst (ReplicateL (LoadL mem)));
4840   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4841   ins_encode %{
4842     int vector_len = 1;
4843     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4844   %}
4845   ins_pipe( pipe_slow );
4846 %}
4847 
4848 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4849   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4850   match(Set dst (ReplicateL (LoadL mem)));
4851   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4852   ins_encode %{
4853     int vector_len = 2;
4854     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4855   %}
4856   ins_pipe( pipe_slow );
4857 %}
4858 
4859 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4860   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4861   match(Set dst (ReplicateL zero));
4862   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4863   ins_encode %{
    // 512-bit vpxor is EVEX-encoded; it is available here because the predicate requires UseAVX > 2.
4865     int vector_len = 2;
4866     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4867   %}
4868   ins_pipe( fpu_reg_reg );
4869 %}
4870 
4871 instruct Repl8F_evex(vecY dst, regF src) %{
4872   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4873   match(Set dst (ReplicateF src));
4874   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4875   ins_encode %{
4876     int vector_len = 1;
4877     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4878   %}
4879   ins_pipe( pipe_slow );
4880 %}
4881 
4882 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4883   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4884   match(Set dst (ReplicateF (LoadF mem)));
4885   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4886   ins_encode %{
4887     int vector_len = 1;
4888     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4889   %}
4890   ins_pipe( pipe_slow );
4891 %}
4892 
4893 instruct Repl16F_evex(vecZ dst, regF src) %{
4894   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4895   match(Set dst (ReplicateF src));
4896   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4897   ins_encode %{
4898     int vector_len = 2;
4899     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4900   %}
4901   ins_pipe( pipe_slow );
4902 %}
4903 
4904 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4905   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4906   match(Set dst (ReplicateF (LoadF mem)));
4907   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4908   ins_encode %{
4909     int vector_len = 2;
4910     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4911   %}
4912   ins_pipe( pipe_slow );
4913 %}
4914 
4915 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4916   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4917   match(Set dst (ReplicateF zero));
4918   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4919   ins_encode %{
    // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ: this is a 512-bit operation
4921     int vector_len = 2;
4922     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4923   %}
4924   ins_pipe( fpu_reg_reg );
4925 %}
4926 
4927 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4928   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4929   match(Set dst (ReplicateF zero));
4930   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4931   ins_encode %{
    // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ: this is a 512-bit operation
4933     int vector_len = 2;
4934     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4935   %}
4936   ins_pipe( fpu_reg_reg );
4937 %}
4938 
4939 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4940   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4941   match(Set dst (ReplicateF zero));
4942   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4943   ins_encode %{
    // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ: this is a 512-bit operation
4945     int vector_len = 2;
4946     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4947   %}
4948   ins_pipe( fpu_reg_reg );
4949 %}
4950 
4951 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4952   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4953   match(Set dst (ReplicateF zero));
4954   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4955   ins_encode %{
    // Use vpxor in place of vxorps since the EVEX encoding of vxorps requires AVX512DQ: this is a 512-bit operation
4957     int vector_len = 2;
4958     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4959   %}
4960   ins_pipe( fpu_reg_reg );
4961 %}
4962 
4963 instruct Repl4D_evex(vecY dst, regD src) %{
4964   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4965   match(Set dst (ReplicateD src));
4966   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4967   ins_encode %{
4968     int vector_len = 1;
4969     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4970   %}
4971   ins_pipe( pipe_slow );
4972 %}
4973 
4974 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4975   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4976   match(Set dst (ReplicateD (LoadD mem)));
4977   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4978   ins_encode %{
4979     int vector_len = 1;
4980     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4981   %}
4982   ins_pipe( pipe_slow );
4983 %}
4984 
4985 instruct Repl8D_evex(vecZ dst, regD src) %{
4986   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4987   match(Set dst (ReplicateD src));
4988   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4989   ins_encode %{
4990     int vector_len = 2;
4991     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4992   %}
4993   ins_pipe( pipe_slow );
4994 %}
4995 
4996 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4997   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4998   match(Set dst (ReplicateD (LoadD mem)));
4999   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
5000   ins_encode %{
5001     int vector_len = 2;
5002     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
5003   %}
5004   ins_pipe( pipe_slow );
5005 %}
5006 
5007 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
5008   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
5009   match(Set dst (ReplicateD zero));
5010   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
5011   ins_encode %{
    // Use vpxor in place of vxorpd since the EVEX encoding of vxorpd requires AVX512DQ: this is a 512-bit operation
5013     int vector_len = 2;
5014     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5015   %}
5016   ins_pipe( fpu_reg_reg );
5017 %}
5018 
5019 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
5020   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
5021   match(Set dst (ReplicateD zero));
5022   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
5023   ins_encode %{
    // Use vpxor in place of vxorpd since the EVEX encoding of vxorpd requires AVX512DQ: this is a 512-bit operation
5025     int vector_len = 2;
5026     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5027   %}
5028   ins_pipe( fpu_reg_reg );
5029 %}
5030 
5031 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
5032   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5033   match(Set dst (ReplicateD zero));
5034   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
5035   ins_encode %{
    // Use vpxor in place of vxorpd since the EVEX encoding of vxorpd requires AVX512DQ: this is a 512-bit operation
5037     int vector_len = 2;
5038     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5039   %}
5040   ins_pipe( fpu_reg_reg );
5041 %}
5042 
5043 // ====================REDUCTION ARITHMETIC=======================================
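
// AddReductionVI produces a scalar result: the integer in src1 plus the sum of
// every lane of the src2 vector.  The variants below differ only in how the
// lane sum is formed (phaddd when only SSE is in use, vphaddd on AVX1-only
// machines, shuffles plus vpaddd on AVX-512).  Scalar reference (illustrative
// only, not part of the build):
//
//   static inline int add_reduction_vi(int src1, const int* src2, int lanes) {
//     int sum = src1;
//     for (int i = 0; i < lanes; i++) sum += src2[i];
//     return sum;
//   }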
5044 
5045 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5046   predicate(UseSSE > 2 && UseAVX == 0);
5047   match(Set dst (AddReductionVI src1 src2));
5048   effect(TEMP tmp2, TEMP tmp);
5049   format %{ "movdqu  $tmp2,$src2\n\t"
5050             "phaddd  $tmp2,$tmp2\n\t"
5051             "movd    $tmp,$src1\n\t"
5052             "paddd   $tmp,$tmp2\n\t"
5053             "movd    $dst,$tmp\t! add reduction2I" %}
5054   ins_encode %{
5055     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
5056     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5057     __ movdl($tmp$$XMMRegister, $src1$$Register);
5058     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
5059     __ movdl($dst$$Register, $tmp$$XMMRegister);
5060   %}
5061   ins_pipe( pipe_slow );
5062 %}
5063 
5064 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5065   predicate(VM_Version::supports_avxonly());
5066   match(Set dst (AddReductionVI src1 src2));
5067   effect(TEMP tmp, TEMP tmp2);
5068   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5069             "movd     $tmp2,$src1\n\t"
5070             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5071             "movd     $dst,$tmp2\t! add reduction2I" %}
5072   ins_encode %{
5073     int vector_len = 0;
5074     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5075     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5076     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5077     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5078   %}
5079   ins_pipe( pipe_slow );
5080 %}
5081 
5082 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5083   predicate(UseAVX > 2);
5084   match(Set dst (AddReductionVI src1 src2));
5085   effect(TEMP tmp, TEMP tmp2);
5086   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5087             "vpaddd  $tmp,$src2,$tmp2\n\t"
5088             "movd    $tmp2,$src1\n\t"
5089             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5090             "movd    $dst,$tmp2\t! add reduction2I" %}
5091   ins_encode %{
5092     int vector_len = 0;
5093     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5094     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5095     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5096     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5097     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5098   %}
5099   ins_pipe( pipe_slow );
5100 %}
5101 
5102 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5103   predicate(UseSSE > 2 && UseAVX == 0);
5104   match(Set dst (AddReductionVI src1 src2));
5105   effect(TEMP tmp, TEMP tmp2);
5106   format %{ "movdqu  $tmp,$src2\n\t"
5107             "phaddd  $tmp,$tmp\n\t"
5108             "phaddd  $tmp,$tmp\n\t"
5109             "movd    $tmp2,$src1\n\t"
5110             "paddd   $tmp2,$tmp\n\t"
5111             "movd    $dst,$tmp2\t! add reduction4I" %}
5112   ins_encode %{
5113     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
5114     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5115     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5116     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5117     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
5118     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5119   %}
5120   ins_pipe( pipe_slow );
5121 %}
5122 
5123 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5124   predicate(VM_Version::supports_avxonly());
5125   match(Set dst (AddReductionVI src1 src2));
5126   effect(TEMP tmp, TEMP tmp2);
5127   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5128             "vphaddd  $tmp,$tmp,$tmp\n\t"
5129             "movd     $tmp2,$src1\n\t"
5130             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5131             "movd     $dst,$tmp2\t! add reduction4I" %}
5132   ins_encode %{
5133     int vector_len = 0;
5134     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5135     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
5136     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5137     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5138     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5139   %}
5140   ins_pipe( pipe_slow );
5141 %}
5142 
5143 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5144   predicate(UseAVX > 2);
5145   match(Set dst (AddReductionVI src1 src2));
5146   effect(TEMP tmp, TEMP tmp2);
5147   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5148             "vpaddd  $tmp,$src2,$tmp2\n\t"
5149             "pshufd  $tmp2,$tmp,0x1\n\t"
5150             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5151             "movd    $tmp2,$src1\n\t"
5152             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5153             "movd    $dst,$tmp2\t! add reduction4I" %}
5154   ins_encode %{
5155     int vector_len = 0;
5156     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5157     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5158     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5159     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5160     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5161     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5162     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5163   %}
5164   ins_pipe( pipe_slow );
5165 %}
5166 
5167 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5168   predicate(VM_Version::supports_avxonly());
5169   match(Set dst (AddReductionVI src1 src2));
5170   effect(TEMP tmp, TEMP tmp2);
5171   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5172             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5173             "vextracti128_high  $tmp2,$tmp\n\t"
5174             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5175             "movd     $tmp2,$src1\n\t"
5176             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5177             "movd     $dst,$tmp2\t! add reduction8I" %}
5178   ins_encode %{
5179     int vector_len = 1;
5180     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5181     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5182     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5183     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5184     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5185     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5186     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5187   %}
5188   ins_pipe( pipe_slow );
5189 %}
5190 
5191 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5192   predicate(UseAVX > 2);
5193   match(Set dst (AddReductionVI src1 src2));
5194   effect(TEMP tmp, TEMP tmp2);
5195   format %{ "vextracti128_high  $tmp,$src2\n\t"
5196             "vpaddd  $tmp,$tmp,$src2\n\t"
5197             "pshufd  $tmp2,$tmp,0xE\n\t"
5198             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5199             "pshufd  $tmp2,$tmp,0x1\n\t"
5200             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5201             "movd    $tmp2,$src1\n\t"
5202             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5203             "movd    $dst,$tmp2\t! add reduction8I" %}
5204   ins_encode %{
5205     int vector_len = 0;
5206     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5207     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5208     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5209     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5210     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5211     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5212     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5213     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5214     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5215   %}
5216   ins_pipe( pipe_slow );
5217 %}
5218 
5219 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5220   predicate(UseAVX > 2);
5221   match(Set dst (AddReductionVI src1 src2));
5222   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5223   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5224             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5225             "vextracti128_high  $tmp,$tmp3\n\t"
5226             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5227             "pshufd  $tmp2,$tmp,0xE\n\t"
5228             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5229             "pshufd  $tmp2,$tmp,0x1\n\t"
5230             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5231             "movd    $tmp2,$src1\n\t"
5232             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5233             "movd    $dst,$tmp2\t! add reduction16I" %}
5234   ins_encode %{
5235     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5236     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5237     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5238     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5239     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5240     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5241     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5242     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5243     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5244     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5245     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5246   %}
5247   ins_pipe( pipe_slow );
5248 %}
5249 
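// Long reductions move values between general and XMM registers with movdq,
// which needs a 64-bit GPR, so these rules are 64-bit only.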
5250 #ifdef _LP64
5251 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5252   predicate(UseAVX > 2);
5253   match(Set dst (AddReductionVL src1 src2));
5254   effect(TEMP tmp, TEMP tmp2);
5255   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5256             "vpaddq  $tmp,$src2,$tmp2\n\t"
5257             "movdq   $tmp2,$src1\n\t"
5258             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5259             "movdq   $dst,$tmp2\t! add reduction2L" %}
5260   ins_encode %{
5261     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5262     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5263     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5264     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5265     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5266   %}
5267   ins_pipe( pipe_slow );
5268 %}
5269 
5270 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5271   predicate(UseAVX > 2);
5272   match(Set dst (AddReductionVL src1 src2));
5273   effect(TEMP tmp, TEMP tmp2);
5274   format %{ "vextracti128_high  $tmp,$src2\n\t"
5275             "vpaddq  $tmp2,$tmp,$src2\n\t"
5276             "pshufd  $tmp,$tmp2,0xE\n\t"
5277             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5278             "movdq   $tmp,$src1\n\t"
5279             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5280             "movdq   $dst,$tmp2\t! add reduction4L" %}
5281   ins_encode %{
5282     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5283     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5284     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5285     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5286     __ movdq($tmp$$XMMRegister, $src1$$Register);
5287     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5288     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5289   %}
5290   ins_pipe( pipe_slow );
5291 %}
5292 
5293 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5294   predicate(UseAVX > 2);
5295   match(Set dst (AddReductionVL src1 src2));
5296   effect(TEMP tmp, TEMP tmp2);
5297   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5298             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5299             "vextracti128_high  $tmp,$tmp2\n\t"
5300             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5301             "pshufd  $tmp,$tmp2,0xE\n\t"
5302             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5303             "movdq   $tmp,$src1\n\t"
5304             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5305             "movdq   $dst,$tmp2\t! add reduction8L" %}
5306   ins_encode %{
5307     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5308     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5309     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5310     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5311     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5312     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5313     __ movdq($tmp$$XMMRegister, $src1$$Register);
5314     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5315     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5316   %}
5317   ins_pipe( pipe_slow );
5318 %}
5319 #endif
5320 
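// Float and double add reductions are expanded as a chain of scalar addss/addsd
// operations, combining one lane at a time in order; pshufd/vextract* bring each
// lane into the low element before it is added.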
5321 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5322   predicate(UseSSE >= 1 && UseAVX == 0);
5323   match(Set dst (AddReductionVF dst src2));
5324   effect(TEMP dst, TEMP tmp);
5325   format %{ "addss   $dst,$src2\n\t"
5326             "pshufd  $tmp,$src2,0x01\n\t"
5327             "addss   $dst,$tmp\t! add reduction2F" %}
5328   ins_encode %{
5329     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5330     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5331     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5332   %}
5333   ins_pipe( pipe_slow );
5334 %}
5335 
5336 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5337   predicate(UseAVX > 0);
5338   match(Set dst (AddReductionVF dst src2));
5339   effect(TEMP dst, TEMP tmp);
5340   format %{ "vaddss  $dst,$dst,$src2\n\t"
5341             "pshufd  $tmp,$src2,0x01\n\t"
5342             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5343   ins_encode %{
5344     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5345     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5346     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5347   %}
5348   ins_pipe( pipe_slow );
5349 %}
5350 
5351 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5352   predicate(UseSSE >= 1 && UseAVX == 0);
5353   match(Set dst (AddReductionVF dst src2));
5354   effect(TEMP dst, TEMP tmp);
5355   format %{ "addss   $dst,$src2\n\t"
5356             "pshufd  $tmp,$src2,0x01\n\t"
5357             "addss   $dst,$tmp\n\t"
5358             "pshufd  $tmp,$src2,0x02\n\t"
5359             "addss   $dst,$tmp\n\t"
5360             "pshufd  $tmp,$src2,0x03\n\t"
5361             "addss   $dst,$tmp\t! add reduction4F" %}
5362   ins_encode %{
5363     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5364     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5365     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5366     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5367     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5368     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5369     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5370   %}
5371   ins_pipe( pipe_slow );
5372 %}
5373 
5374 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5375   predicate(UseAVX > 0);
5376   match(Set dst (AddReductionVF dst src2));
5377   effect(TEMP tmp, TEMP dst);
5378   format %{ "vaddss  $dst,$dst,$src2\n\t"
5379             "pshufd  $tmp,$src2,0x01\n\t"
5380             "vaddss  $dst,$dst,$tmp\n\t"
5381             "pshufd  $tmp,$src2,0x02\n\t"
5382             "vaddss  $dst,$dst,$tmp\n\t"
5383             "pshufd  $tmp,$src2,0x03\n\t"
5384             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5385   ins_encode %{
5386     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5387     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5388     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5389     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5390     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5391     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5392     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5393   %}
5394   ins_pipe( pipe_slow );
5395 %}
5396 
5397 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5398   predicate(UseAVX > 0);
5399   match(Set dst (AddReductionVF dst src2));
5400   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5401   format %{ "vaddss  $dst,$dst,$src2\n\t"
5402             "pshufd  $tmp,$src2,0x01\n\t"
5403             "vaddss  $dst,$dst,$tmp\n\t"
5404             "pshufd  $tmp,$src2,0x02\n\t"
5405             "vaddss  $dst,$dst,$tmp\n\t"
5406             "pshufd  $tmp,$src2,0x03\n\t"
5407             "vaddss  $dst,$dst,$tmp\n\t"
5408             "vextractf128_high  $tmp2,$src2\n\t"
5409             "vaddss  $dst,$dst,$tmp2\n\t"
5410             "pshufd  $tmp,$tmp2,0x01\n\t"
5411             "vaddss  $dst,$dst,$tmp\n\t"
5412             "pshufd  $tmp,$tmp2,0x02\n\t"
5413             "vaddss  $dst,$dst,$tmp\n\t"
5414             "pshufd  $tmp,$tmp2,0x03\n\t"
5415             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5416   ins_encode %{
5417     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5418     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5419     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5420     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5421     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5422     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5423     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5424     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5425     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5426     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5427     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5428     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5429     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5430     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5431     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5432   %}
5433   ins_pipe( pipe_slow );
5434 %}
5435 
5436 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5437   predicate(UseAVX > 2);
5438   match(Set dst (AddReductionVF dst src2));
5439   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5440   format %{ "vaddss  $dst,$dst,$src2\n\t"
5441             "pshufd  $tmp,$src2,0x01\n\t"
5442             "vaddss  $dst,$dst,$tmp\n\t"
5443             "pshufd  $tmp,$src2,0x02\n\t"
5444             "vaddss  $dst,$dst,$tmp\n\t"
5445             "pshufd  $tmp,$src2,0x03\n\t"
5446             "vaddss  $dst,$dst,$tmp\n\t"
5447             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5448             "vaddss  $dst,$dst,$tmp2\n\t"
5449             "pshufd  $tmp,$tmp2,0x01\n\t"
5450             "vaddss  $dst,$dst,$tmp\n\t"
5451             "pshufd  $tmp,$tmp2,0x02\n\t"
5452             "vaddss  $dst,$dst,$tmp\n\t"
5453             "pshufd  $tmp,$tmp2,0x03\n\t"
5454             "vaddss  $dst,$dst,$tmp\n\t"
5455             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5456             "vaddss  $dst,$dst,$tmp2\n\t"
5457             "pshufd  $tmp,$tmp2,0x01\n\t"
5458             "vaddss  $dst,$dst,$tmp\n\t"
5459             "pshufd  $tmp,$tmp2,0x02\n\t"
5460             "vaddss  $dst,$dst,$tmp\n\t"
5461             "pshufd  $tmp,$tmp2,0x03\n\t"
5462             "vaddss  $dst,$dst,$tmp\n\t"
5463             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5464             "vaddss  $dst,$dst,$tmp2\n\t"
5465             "pshufd  $tmp,$tmp2,0x01\n\t"
5466             "vaddss  $dst,$dst,$tmp\n\t"
5467             "pshufd  $tmp,$tmp2,0x02\n\t"
5468             "vaddss  $dst,$dst,$tmp\n\t"
5469             "pshufd  $tmp,$tmp2,0x03\n\t"
5470             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5471   ins_encode %{
5472     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5473     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5474     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5475     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5476     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5477     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5478     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5479     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5480     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5481     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5482     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5483     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5484     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5485     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5486     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5487     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5488     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5489     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5490     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5491     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5492     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5493     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5494     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5495     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5496     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5497     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5498     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5499     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5500     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5501     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5502     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5503   %}
5504   ins_pipe( pipe_slow );
5505 %}
5506 
5507 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5508   predicate(UseSSE >= 1 && UseAVX == 0);
5509   match(Set dst (AddReductionVD dst src2));
5510   effect(TEMP tmp, TEMP dst);
5511   format %{ "addsd   $dst,$src2\n\t"
5512             "pshufd  $tmp,$src2,0xE\n\t"
5513             "addsd   $dst,$tmp\t! add reduction2D" %}
5514   ins_encode %{
5515     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5516     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5517     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5518   %}
5519   ins_pipe( pipe_slow );
5520 %}
5521 
5522 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5523   predicate(UseAVX > 0);
5524   match(Set dst (AddReductionVD dst src2));
5525   effect(TEMP tmp, TEMP dst);
5526   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5527             "pshufd  $tmp,$src2,0xE\n\t"
5528             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5529   ins_encode %{
5530     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5531     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5532     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5533   %}
5534   ins_pipe( pipe_slow );
5535 %}
5536 
5537 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5538   predicate(UseAVX > 0);
5539   match(Set dst (AddReductionVD dst src2));
5540   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5541   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5542             "pshufd  $tmp,$src2,0xE\n\t"
5543             "vaddsd  $dst,$dst,$tmp\n\t"
5544             "vextractf128  $tmp2,$src2,0x1\n\t"
5545             "vaddsd  $dst,$dst,$tmp2\n\t"
5546             "pshufd  $tmp,$tmp2,0xE\n\t"
5547             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5548   ins_encode %{
5549     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5550     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5551     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5552     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5553     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5554     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5555     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5556   %}
5557   ins_pipe( pipe_slow );
5558 %}
5559 
5560 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5561   predicate(UseAVX > 2);
5562   match(Set dst (AddReductionVD dst src2));
5563   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5564   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5565             "pshufd  $tmp,$src2,0xE\n\t"
5566             "vaddsd  $dst,$dst,$tmp\n\t"
5567             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5568             "vaddsd  $dst,$dst,$tmp2\n\t"
5569             "pshufd  $tmp,$tmp2,0xE\n\t"
5570             "vaddsd  $dst,$dst,$tmp\n\t"
5571             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5572             "vaddsd  $dst,$dst,$tmp2\n\t"
5573             "pshufd  $tmp,$tmp2,0xE\n\t"
5574             "vaddsd  $dst,$dst,$tmp\n\t"
5575             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5576             "vaddsd  $dst,$dst,$tmp2\n\t"
5577             "pshufd  $tmp,$tmp2,0xE\n\t"
5578             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5579   ins_encode %{
5580     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5581     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5582     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5583     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5584     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5585     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5586     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5587     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5588     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5589     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5590     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5591     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5592     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5593     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5594     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5595   %}
5596   ins_pipe( pipe_slow );
5597 %}
5598 
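// Integer multiply reductions follow the same lane-folding pattern as the add
// reductions above, using pmulld/vpmulld (and vpmullq for longs, which is why
// the long forms require AVX512DQ).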
5599 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5600   predicate(UseSSE > 3 && UseAVX == 0);
5601   match(Set dst (MulReductionVI src1 src2));
5602   effect(TEMP tmp, TEMP tmp2);
5603   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5604             "pmulld  $tmp2,$src2\n\t"
5605             "movd    $tmp,$src1\n\t"
5606             "pmulld  $tmp2,$tmp\n\t"
5607             "movd    $dst,$tmp2\t! mul reduction2I" %}
5608   ins_encode %{
5609     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5610     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5611     __ movdl($tmp$$XMMRegister, $src1$$Register);
5612     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5613     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5614   %}
5615   ins_pipe( pipe_slow );
5616 %}
5617 
5618 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5619   predicate(UseAVX > 0);
5620   match(Set dst (MulReductionVI src1 src2));
5621   effect(TEMP tmp, TEMP tmp2);
5622   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5623             "vpmulld  $tmp,$src2,$tmp2\n\t"
5624             "movd     $tmp2,$src1\n\t"
5625             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5626             "movd     $dst,$tmp2\t! mul reduction2I" %}
5627   ins_encode %{
5628     int vector_len = 0;
5629     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5630     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5631     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5632     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5633     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5634   %}
5635   ins_pipe( pipe_slow );
5636 %}
5637 
5638 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5639   predicate(UseSSE > 3 && UseAVX == 0);
5640   match(Set dst (MulReductionVI src1 src2));
5641   effect(TEMP tmp, TEMP tmp2);
5642   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5643             "pmulld  $tmp2,$src2\n\t"
5644             "pshufd  $tmp,$tmp2,0x1\n\t"
5645             "pmulld  $tmp2,$tmp\n\t"
5646             "movd    $tmp,$src1\n\t"
5647             "pmulld  $tmp2,$tmp\n\t"
5648             "movd    $dst,$tmp2\t! mul reduction4I" %}
5649   ins_encode %{
5650     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5651     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5652     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5653     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5654     __ movdl($tmp$$XMMRegister, $src1$$Register);
5655     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5656     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5657   %}
5658   ins_pipe( pipe_slow );
5659 %}
5660 
5661 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5662   predicate(UseAVX > 0);
5663   match(Set dst (MulReductionVI src1 src2));
5664   effect(TEMP tmp, TEMP tmp2);
5665   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5666             "vpmulld  $tmp,$src2,$tmp2\n\t"
5667             "pshufd   $tmp2,$tmp,0x1\n\t"
5668             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5669             "movd     $tmp2,$src1\n\t"
5670             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5671             "movd     $dst,$tmp2\t! mul reduction4I" %}
5672   ins_encode %{
5673     int vector_len = 0;
5674     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5675     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5676     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5677     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5678     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5679     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5680     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5681   %}
5682   ins_pipe( pipe_slow );
5683 %}
5684 
5685 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5686   predicate(UseAVX > 1);
5687   match(Set dst (MulReductionVI src1 src2));
5688   effect(TEMP tmp, TEMP tmp2);
5689   format %{ "vextracti128_high  $tmp,$src2\n\t"
5690             "vpmulld  $tmp,$tmp,$src2\n\t"
5691             "pshufd   $tmp2,$tmp,0xE\n\t"
5692             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5693             "pshufd   $tmp2,$tmp,0x1\n\t"
5694             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5695             "movd     $tmp2,$src1\n\t"
5696             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5697             "movd     $dst,$tmp2\t! mul reduction8I" %}
5698   ins_encode %{
5699     int vector_len = 0;
5700     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5701     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5702     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5703     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5704     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5705     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5706     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5707     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5708     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5709   %}
5710   ins_pipe( pipe_slow );
5711 %}
5712 
5713 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5714   predicate(UseAVX > 2);
5715   match(Set dst (MulReductionVI src1 src2));
5716   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5717   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5718             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5719             "vextracti128_high  $tmp,$tmp3\n\t"
5720             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5721             "pshufd   $tmp2,$tmp,0xE\n\t"
5722             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5723             "pshufd   $tmp2,$tmp,0x1\n\t"
5724             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5725             "movd     $tmp2,$src1\n\t"
5726             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5727             "movd     $dst,$tmp2\t! mul reduction16I" %}
5728   ins_encode %{
5729     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5730     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5731     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5732     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5733     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5734     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5735     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5736     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5737     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5738     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5739     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5740   %}
5741   ins_pipe( pipe_slow );
5742 %}
5743 
5744 #ifdef _LP64
5745 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5746   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5747   match(Set dst (MulReductionVL src1 src2));
5748   effect(TEMP tmp, TEMP tmp2);
5749   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5750             "vpmullq  $tmp,$src2,$tmp2\n\t"
5751             "movdq    $tmp2,$src1\n\t"
5752             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5753             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5754   ins_encode %{
5755     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5756     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5757     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5758     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5759     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5760   %}
5761   ins_pipe( pipe_slow );
5762 %}
5763 
5764 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5765   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5766   match(Set dst (MulReductionVL src1 src2));
5767   effect(TEMP tmp, TEMP tmp2);
5768   format %{ "vextracti128_high  $tmp,$src2\n\t"
5769             "vpmullq  $tmp2,$tmp,$src2\n\t"
5770             "pshufd   $tmp,$tmp2,0xE\n\t"
5771             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5772             "movdq    $tmp,$src1\n\t"
5773             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5774             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5775   ins_encode %{
5776     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5777     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5778     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5779     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5780     __ movdq($tmp$$XMMRegister, $src1$$Register);
5781     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5782     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5783   %}
5784   ins_pipe( pipe_slow );
5785 %}
5786 
5787 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5788   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5789   match(Set dst (MulReductionVL src1 src2));
5790   effect(TEMP tmp, TEMP tmp2);
5791   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5792             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5793             "vextracti128_high  $tmp,$tmp2\n\t"
5794             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5795             "pshufd   $tmp,$tmp2,0xE\n\t"
5796             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5797             "movdq    $tmp,$src1\n\t"
5798             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5799             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5800   ins_encode %{
5801     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5802     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5803     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5804     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5805     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5806     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5807     __ movdq($tmp$$XMMRegister, $src1$$Register);
5808     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5809     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5810   %}
5811   ins_pipe( pipe_slow );
5812 %}
5813 #endif
5814 
5815 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5816   predicate(UseSSE >= 1 && UseAVX == 0);
5817   match(Set dst (MulReductionVF dst src2));
5818   effect(TEMP dst, TEMP tmp);
5819   format %{ "mulss   $dst,$src2\n\t"
5820             "pshufd  $tmp,$src2,0x01\n\t"
5821             "mulss   $dst,$tmp\t! mul reduction2F" %}
5822   ins_encode %{
5823     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5824     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5825     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5826   %}
5827   ins_pipe( pipe_slow );
5828 %}
5829 
5830 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5831   predicate(UseAVX > 0);
5832   match(Set dst (MulReductionVF dst src2));
5833   effect(TEMP tmp, TEMP dst);
5834   format %{ "vmulss  $dst,$dst,$src2\n\t"
5835             "pshufd  $tmp,$src2,0x01\n\t"
5836             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5837   ins_encode %{
5838     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5839     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5840     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5841   %}
5842   ins_pipe( pipe_slow );
5843 %}
5844 
5845 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5846   predicate(UseSSE >= 1 && UseAVX == 0);
5847   match(Set dst (MulReductionVF dst src2));
5848   effect(TEMP dst, TEMP tmp);
5849   format %{ "mulss   $dst,$src2\n\t"
5850             "pshufd  $tmp,$src2,0x01\n\t"
5851             "mulss   $dst,$tmp\n\t"
5852             "pshufd  $tmp,$src2,0x02\n\t"
5853             "mulss   $dst,$tmp\n\t"
5854             "pshufd  $tmp,$src2,0x03\n\t"
5855             "mulss   $dst,$tmp\t! mul reduction4F" %}
5856   ins_encode %{
5857     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5858     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5859     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5860     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5861     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5862     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5863     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5864   %}
5865   ins_pipe( pipe_slow );
5866 %}
5867 
5868 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5869   predicate(UseAVX > 0);
5870   match(Set dst (MulReductionVF dst src2));
5871   effect(TEMP tmp, TEMP dst);
5872   format %{ "vmulss  $dst,$dst,$src2\n\t"
5873             "pshufd  $tmp,$src2,0x01\n\t"
5874             "vmulss  $dst,$dst,$tmp\n\t"
5875             "pshufd  $tmp,$src2,0x02\n\t"
5876             "vmulss  $dst,$dst,$tmp\n\t"
5877             "pshufd  $tmp,$src2,0x03\n\t"
5878             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5879   ins_encode %{
5880     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5881     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5882     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5883     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5884     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5885     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5886     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5892   predicate(UseAVX > 0);
5893   match(Set dst (MulReductionVF dst src2));
5894   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5895   format %{ "vmulss  $dst,$dst,$src2\n\t"
5896             "pshufd  $tmp,$src2,0x01\n\t"
5897             "vmulss  $dst,$dst,$tmp\n\t"
5898             "pshufd  $tmp,$src2,0x02\n\t"
5899             "vmulss  $dst,$dst,$tmp\n\t"
5900             "pshufd  $tmp,$src2,0x03\n\t"
5901             "vmulss  $dst,$dst,$tmp\n\t"
5902             "vextractf128_high  $tmp2,$src2\n\t"
5903             "vmulss  $dst,$dst,$tmp2\n\t"
5904             "pshufd  $tmp,$tmp2,0x01\n\t"
5905             "vmulss  $dst,$dst,$tmp\n\t"
5906             "pshufd  $tmp,$tmp2,0x02\n\t"
5907             "vmulss  $dst,$dst,$tmp\n\t"
5908             "pshufd  $tmp,$tmp2,0x03\n\t"
5909             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5910   ins_encode %{
5911     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5912     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5913     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5914     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5915     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5916     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5917     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5918     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5919     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5920     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5921     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5922     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5923     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5924     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5925     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5926   %}
5927   ins_pipe( pipe_slow );
5928 %}
5929 
5930 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5931   predicate(UseAVX > 2);
5932   match(Set dst (MulReductionVF dst src2));
5933   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5934   format %{ "vmulss  $dst,$dst,$src2\n\t"
5935             "pshufd  $tmp,$src2,0x01\n\t"
5936             "vmulss  $dst,$dst,$tmp\n\t"
5937             "pshufd  $tmp,$src2,0x02\n\t"
5938             "vmulss  $dst,$dst,$tmp\n\t"
5939             "pshufd  $tmp,$src2,0x03\n\t"
5940             "vmulss  $dst,$dst,$tmp\n\t"
5941             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5942             "vmulss  $dst,$dst,$tmp2\n\t"
5943             "pshufd  $tmp,$tmp2,0x01\n\t"
5944             "vmulss  $dst,$dst,$tmp\n\t"
5945             "pshufd  $tmp,$tmp2,0x02\n\t"
5946             "vmulss  $dst,$dst,$tmp\n\t"
5947             "pshufd  $tmp,$tmp2,0x03\n\t"
5948             "vmulss  $dst,$dst,$tmp\n\t"
5949             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5950             "vmulss  $dst,$dst,$tmp2\n\t"
5951             "pshufd  $tmp,$tmp2,0x01\n\t"
5952             "vmulss  $dst,$dst,$tmp\n\t"
5953             "pshufd  $tmp,$tmp2,0x02\n\t"
5954             "vmulss  $dst,$dst,$tmp\n\t"
5955             "pshufd  $tmp,$tmp2,0x03\n\t"
5956             "vmulss  $dst,$dst,$tmp\n\t"
5957             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5958             "vmulss  $dst,$dst,$tmp2\n\t"
5959             "pshufd  $tmp,$tmp2,0x01\n\t"
5960             "vmulss  $dst,$dst,$tmp\n\t"
5961             "pshufd  $tmp,$tmp2,0x02\n\t"
5962             "vmulss  $dst,$dst,$tmp\n\t"
5963             "pshufd  $tmp,$tmp2,0x03\n\t"
5964             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5965   ins_encode %{
5966     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5967     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5968     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5969     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5970     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5971     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5972     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5973     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5974     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5975     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5976     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5977     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5978     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5979     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5980     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5981     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5982     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5983     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5984     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5985     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5986     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5987     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5988     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5989     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5990     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5991     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5992     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5993     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5994     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5995     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5996     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5997   %}
5998   ins_pipe( pipe_slow );
5999 %}
6000 
6001 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6002   predicate(UseSSE >= 1 && UseAVX == 0);
6003   match(Set dst (MulReductionVD dst src2));
6004   effect(TEMP dst, TEMP tmp);
6005   format %{ "mulsd   $dst,$src2\n\t"
6006             "pshufd  $tmp,$src2,0xE\n\t"
6007             "mulsd   $dst,$tmp\t! mul reduction2D" %}
6008   ins_encode %{
6009     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
6010     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6011     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
6012   %}
6013   ins_pipe( pipe_slow );
6014 %}
6015 
6016 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6017   predicate(UseAVX > 0);
6018   match(Set dst (MulReductionVD dst src2));
6019   effect(TEMP tmp, TEMP dst);
6020   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6021             "pshufd  $tmp,$src2,0xE\n\t"
6022             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
6023   ins_encode %{
6024     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6025     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6026     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6027   %}
6028   ins_pipe( pipe_slow );
6029 %}
6030 
6031 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
6032   predicate(UseAVX > 0);
6033   match(Set dst (MulReductionVD dst src2));
6034   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6035   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6036             "pshufd  $tmp,$src2,0xE\n\t"
6037             "vmulsd  $dst,$dst,$tmp\n\t"
6038             "vextractf128_high  $tmp2,$src2\n\t"
6039             "vmulsd  $dst,$dst,$tmp2\n\t"
6040             "pshufd  $tmp,$tmp2,0xE\n\t"
6041             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
6042   ins_encode %{
6043     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6044     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6045     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6046     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6047     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6048     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6049     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6050   %}
6051   ins_pipe( pipe_slow );
6052 %}
6053 
6054 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
6055   predicate(UseAVX > 2);
6056   match(Set dst (MulReductionVD dst src2));
6057   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6058   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6059             "pshufd  $tmp,$src2,0xE\n\t"
6060             "vmulsd  $dst,$dst,$tmp\n\t"
6061             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6062             "vmulsd  $dst,$dst,$tmp2\n\t"
6063             "pshufd  $tmp,$tmp2,0xE\n\t"
6064             "vmulsd  $dst,$dst,$tmp\n\t"
6065             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6066             "vmulsd  $dst,$dst,$tmp2\n\t"
6067             "pshufd  $tmp,$tmp2,0xE\n\t"
6068             "vmulsd  $dst,$dst,$tmp\n\t"
6069             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6070             "vmulsd  $dst,$dst,$tmp2\n\t"
6071             "pshufd  $tmp,$tmp2,0xE\n\t"
6072             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
6073   ins_encode %{
6074     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6075     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6076     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6077     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6078     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6079     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6080     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6081     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6082     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6083     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6084     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6085     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6086     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6087     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6088     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6089   %}
6090   ins_pipe( pipe_slow );
6091 %}
6092 
6093 // ====================VECTOR ARITHMETIC=======================================
6094 
6095 // --------------------------------- ADD --------------------------------------
6096 
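// In the packed rules below, vector_len selects the AVX operand width:
// 0 = 128-bit (XMM), 1 = 256-bit (YMM), 2 = 512-bit (ZMM).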
6097 // Bytes vector add
6098 instruct vadd4B(vecS dst, vecS src) %{
6099   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6100   match(Set dst (AddVB dst src));
6101   format %{ "paddb   $dst,$src\t! add packed4B" %}
6102   ins_encode %{
6103     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6104   %}
6105   ins_pipe( pipe_slow );
6106 %}
6107 
6108 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
6109   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6110   match(Set dst (AddVB src1 src2));
6111   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
6112   ins_encode %{
6113     int vector_len = 0;
6114     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6115   %}
6116   ins_pipe( pipe_slow );
6117 %}
6118 
6119 
6120 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
6121   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6122   match(Set dst (AddVB src (LoadVector mem)));
6123   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6124   ins_encode %{
6125     int vector_len = 0;
6126     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6127   %}
6128   ins_pipe( pipe_slow );
6129 %}
6130 
6131 instruct vadd8B(vecD dst, vecD src) %{
6132   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6133   match(Set dst (AddVB dst src));
6134   format %{ "paddb   $dst,$src\t! add packed8B" %}
6135   ins_encode %{
6136     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6137   %}
6138   ins_pipe( pipe_slow );
6139 %}
6140 
6141 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6142   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6143   match(Set dst (AddVB src1 src2));
6144   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6145   ins_encode %{
6146     int vector_len = 0;
6147     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6148   %}
6149   ins_pipe( pipe_slow );
6150 %}
6151 
6152 
6153 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6154   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6155   match(Set dst (AddVB src (LoadVector mem)));
6156   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6157   ins_encode %{
6158     int vector_len = 0;
6159     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6160   %}
6161   ins_pipe( pipe_slow );
6162 %}
6163 
6164 instruct vadd16B(vecX dst, vecX src) %{
6165   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6166   match(Set dst (AddVB dst src));
6167   format %{ "paddb   $dst,$src\t! add packed16B" %}
6168   ins_encode %{
6169     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6170   %}
6171   ins_pipe( pipe_slow );
6172 %}
6173 
6174 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6175   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6176   match(Set dst (AddVB src1 src2));
6177   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6178   ins_encode %{
6179     int vector_len = 0;
6180     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6181   %}
6182   ins_pipe( pipe_slow );
6183 %}
6184 
6185 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6186   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6187   match(Set dst (AddVB src (LoadVector mem)));
6188   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6189   ins_encode %{
6190     int vector_len = 0;
6191     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6192   %}
6193   ins_pipe( pipe_slow );
6194 %}
6195 
6196 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6197   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6198   match(Set dst (AddVB src1 src2));
6199   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6200   ins_encode %{
6201     int vector_len = 1;
6202     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6203   %}
6204   ins_pipe( pipe_slow );
6205 %}
6206 
6207 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6208   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6209   match(Set dst (AddVB src (LoadVector mem)));
6210   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6211   ins_encode %{
6212     int vector_len = 1;
6213     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6214   %}
6215   ins_pipe( pipe_slow );
6216 %}
6217 
6218 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6219   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6220   match(Set dst (AddVB src1 src2));
6221   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6222   ins_encode %{
6223     int vector_len = 2;
6224     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6225   %}
6226   ins_pipe( pipe_slow );
6227 %}
6228 
6229 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6230   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6231   match(Set dst (AddVB src (LoadVector mem)));
6232   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6233   ins_encode %{
6234     int vector_len = 2;
6235     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6236   %}
6237   ins_pipe( pipe_slow );
6238 %}
6239 
6240 // Shorts/Chars vector add
6241 instruct vadd2S(vecS dst, vecS src) %{
6242   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6243   match(Set dst (AddVS dst src));
6244   format %{ "paddw   $dst,$src\t! add packed2S" %}
6245   ins_encode %{
6246     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6247   %}
6248   ins_pipe( pipe_slow );
6249 %}
6250 
6251 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6252   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6253   match(Set dst (AddVS src1 src2));
6254   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6255   ins_encode %{
6256     int vector_len = 0;
6257     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6258   %}
6259   ins_pipe( pipe_slow );
6260 %}
6261 
6262 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6263   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6264   match(Set dst (AddVS src (LoadVector mem)));
6265   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6266   ins_encode %{
6267     int vector_len = 0;
6268     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6269   %}
6270   ins_pipe( pipe_slow );
6271 %}
6272 
6273 instruct vadd4S(vecD dst, vecD src) %{
6274   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6275   match(Set dst (AddVS dst src));
6276   format %{ "paddw   $dst,$src\t! add packed4S" %}
6277   ins_encode %{
6278     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6279   %}
6280   ins_pipe( pipe_slow );
6281 %}
6282 
6283 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6284   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6285   match(Set dst (AddVS src1 src2));
6286   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6287   ins_encode %{
6288     int vector_len = 0;
6289     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6290   %}
6291   ins_pipe( pipe_slow );
6292 %}
6293 
6294 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6295   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6296   match(Set dst (AddVS src (LoadVector mem)));
6297   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6298   ins_encode %{
6299     int vector_len = 0;
6300     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6301   %}
6302   ins_pipe( pipe_slow );
6303 %}
6304 
6305 instruct vadd8S(vecX dst, vecX src) %{
6306   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6307   match(Set dst (AddVS dst src));
6308   format %{ "paddw   $dst,$src\t! add packed8S" %}
6309   ins_encode %{
6310     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6311   %}
6312   ins_pipe( pipe_slow );
6313 %}
6314 
6315 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6316   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6317   match(Set dst (AddVS src1 src2));
6318   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6319   ins_encode %{
6320     int vector_len = 0;
6321     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6322   %}
6323   ins_pipe( pipe_slow );
6324 %}
6325 
6326 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6327   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6328   match(Set dst (AddVS src (LoadVector mem)));
6329   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6330   ins_encode %{
6331     int vector_len = 0;
6332     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6333   %}
6334   ins_pipe( pipe_slow );
6335 %}
6336 
6337 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6338   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6339   match(Set dst (AddVS src1 src2));
6340   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6341   ins_encode %{
6342     int vector_len = 1;
6343     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6344   %}
6345   ins_pipe( pipe_slow );
6346 %}
6347 
6348 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6349   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6350   match(Set dst (AddVS src (LoadVector mem)));
6351   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6352   ins_encode %{
6353     int vector_len = 1;
6354     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6355   %}
6356   ins_pipe( pipe_slow );
6357 %}
6358 
6359 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6360   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6361   match(Set dst (AddVS src1 src2));
6362   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6363   ins_encode %{
6364     int vector_len = 2;
6365     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6366   %}
6367   ins_pipe( pipe_slow );
6368 %}
6369 
6370 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6371   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6372   match(Set dst (AddVS src (LoadVector mem)));
6373   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6374   ins_encode %{
6375     int vector_len = 2;
6376     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6377   %}
6378   ins_pipe( pipe_slow );
6379 %}
6380 
6381 // Integers vector add
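     // 256-bit packed-integer adds require AVX2 (UseAVX > 1); the 512-bit dword
     // forms need only AVX512F (UseAVX > 2), so no extra feature check appears here.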
6382 instruct vadd2I(vecD dst, vecD src) %{
6383   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6384   match(Set dst (AddVI dst src));
6385   format %{ "paddd   $dst,$src\t! add packed2I" %}
6386   ins_encode %{
6387     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6388   %}
6389   ins_pipe( pipe_slow );
6390 %}
6391 
6392 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6393   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6394   match(Set dst (AddVI src1 src2));
6395   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6396   ins_encode %{
6397     int vector_len = 0;
6398     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6399   %}
6400   ins_pipe( pipe_slow );
6401 %}
6402 
6403 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6404   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6405   match(Set dst (AddVI src (LoadVector mem)));
6406   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6407   ins_encode %{
6408     int vector_len = 0;
6409     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6410   %}
6411   ins_pipe( pipe_slow );
6412 %}
6413 
6414 instruct vadd4I(vecX dst, vecX src) %{
6415   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6416   match(Set dst (AddVI dst src));
6417   format %{ "paddd   $dst,$src\t! add packed4I" %}
6418   ins_encode %{
6419     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6420   %}
6421   ins_pipe( pipe_slow );
6422 %}
6423 
6424 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6425   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6426   match(Set dst (AddVI src1 src2));
6427   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6428   ins_encode %{
6429     int vector_len = 0;
6430     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6431   %}
6432   ins_pipe( pipe_slow );
6433 %}
6434 
6435 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6436   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6437   match(Set dst (AddVI src (LoadVector mem)));
6438   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6439   ins_encode %{
6440     int vector_len = 0;
6441     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6442   %}
6443   ins_pipe( pipe_slow );
6444 %}
6445 
6446 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6447   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6448   match(Set dst (AddVI src1 src2));
6449   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6450   ins_encode %{
6451     int vector_len = 1;
6452     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6453   %}
6454   ins_pipe( pipe_slow );
6455 %}
6456 
6457 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6458   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6459   match(Set dst (AddVI src (LoadVector mem)));
6460   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6461   ins_encode %{
6462     int vector_len = 1;
6463     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6464   %}
6465   ins_pipe( pipe_slow );
6466 %}
6467 
6468 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6469   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6470   match(Set dst (AddVI src1 src2));
6471   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6472   ins_encode %{
6473     int vector_len = 2;
6474     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6475   %}
6476   ins_pipe( pipe_slow );
6477 %}
6478 
6479 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6480   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6481   match(Set dst (AddVI src (LoadVector mem)));
6482   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6483   ins_encode %{
6484     int vector_len = 2;
6485     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6486   %}
6487   ins_pipe( pipe_slow );
6488 %}
6489 
6490 // Longs vector add
6491 instruct vadd2L(vecX dst, vecX src) %{
6492   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6493   match(Set dst (AddVL dst src));
6494   format %{ "paddq   $dst,$src\t! add packed2L" %}
6495   ins_encode %{
6496     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6497   %}
6498   ins_pipe( pipe_slow );
6499 %}
6500 
6501 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6502   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6503   match(Set dst (AddVL src1 src2));
6504   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6505   ins_encode %{
6506     int vector_len = 0;
6507     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6508   %}
6509   ins_pipe( pipe_slow );
6510 %}
6511 
6512 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6513   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6514   match(Set dst (AddVL src (LoadVector mem)));
6515   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6516   ins_encode %{
6517     int vector_len = 0;
6518     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6519   %}
6520   ins_pipe( pipe_slow );
6521 %}
6522 
6523 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6524   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6525   match(Set dst (AddVL src1 src2));
6526   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6527   ins_encode %{
6528     int vector_len = 1;
6529     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6530   %}
6531   ins_pipe( pipe_slow );
6532 %}
6533 
6534 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6535   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6536   match(Set dst (AddVL src (LoadVector mem)));
6537   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6538   ins_encode %{
6539     int vector_len = 1;
6540     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6541   %}
6542   ins_pipe( pipe_slow );
6543 %}
6544 
6545 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6546   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6547   match(Set dst (AddVL src1 src2));
6548   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6549   ins_encode %{
6550     int vector_len = 2;
6551     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6552   %}
6553   ins_pipe( pipe_slow );
6554 %}
6555 
6556 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6557   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6558   match(Set dst (AddVL src (LoadVector mem)));
6559   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6560   ins_encode %{
6561     int vector_len = 2;
6562     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6563   %}
6564   ins_pipe( pipe_slow );
6565 %}
6566 
6567 // Floats vector add
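     // Unlike the packed-integer forms above, the 256-bit floating-point adds need
     // only AVX (UseAVX > 0), since AVX1 already provides 256-bit FP arithmetic.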
6568 instruct vadd2F(vecD dst, vecD src) %{
6569   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6570   match(Set dst (AddVF dst src));
6571   format %{ "addps   $dst,$src\t! add packed2F" %}
6572   ins_encode %{
6573     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6574   %}
6575   ins_pipe( pipe_slow );
6576 %}
6577 
6578 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6579   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6580   match(Set dst (AddVF src1 src2));
6581   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6582   ins_encode %{
6583     int vector_len = 0;
6584     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6585   %}
6586   ins_pipe( pipe_slow );
6587 %}
6588 
6589 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6590   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6591   match(Set dst (AddVF src (LoadVector mem)));
6592   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6593   ins_encode %{
6594     int vector_len = 0;
6595     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6596   %}
6597   ins_pipe( pipe_slow );
6598 %}
6599 
6600 instruct vadd4F(vecX dst, vecX src) %{
6601   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6602   match(Set dst (AddVF dst src));
6603   format %{ "addps   $dst,$src\t! add packed4F" %}
6604   ins_encode %{
6605     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6606   %}
6607   ins_pipe( pipe_slow );
6608 %}
6609 
6610 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6611   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6612   match(Set dst (AddVF src1 src2));
6613   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6614   ins_encode %{
6615     int vector_len = 0;
6616     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6617   %}
6618   ins_pipe( pipe_slow );
6619 %}
6620 
6621 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6622   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6623   match(Set dst (AddVF src (LoadVector mem)));
6624   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6625   ins_encode %{
6626     int vector_len = 0;
6627     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6628   %}
6629   ins_pipe( pipe_slow );
6630 %}
6631 
6632 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6633   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6634   match(Set dst (AddVF src1 src2));
6635   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6636   ins_encode %{
6637     int vector_len = 1;
6638     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6639   %}
6640   ins_pipe( pipe_slow );
6641 %}
6642 
6643 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6644   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6645   match(Set dst (AddVF src (LoadVector mem)));
6646   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6647   ins_encode %{
6648     int vector_len = 1;
6649     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6650   %}
6651   ins_pipe( pipe_slow );
6652 %}
6653 
6654 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6655   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6656   match(Set dst (AddVF src1 src2));
6657   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6658   ins_encode %{
6659     int vector_len = 2;
6660     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6661   %}
6662   ins_pipe( pipe_slow );
6663 %}
6664 
6665 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6666   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6667   match(Set dst (AddVF src (LoadVector mem)));
6668   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6669   ins_encode %{
6670     int vector_len = 2;
6671     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6672   %}
6673   ins_pipe( pipe_slow );
6674 %}
6675 
6676 // Doubles vector add
6677 instruct vadd2D(vecX dst, vecX src) %{
6678   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6679   match(Set dst (AddVD dst src));
6680   format %{ "addpd   $dst,$src\t! add packed2D" %}
6681   ins_encode %{
6682     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6683   %}
6684   ins_pipe( pipe_slow );
6685 %}
6686 
6687 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6688   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6689   match(Set dst (AddVD src1 src2));
6690   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6691   ins_encode %{
6692     int vector_len = 0;
6693     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6694   %}
6695   ins_pipe( pipe_slow );
6696 %}
6697 
6698 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6699   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6700   match(Set dst (AddVD src (LoadVector mem)));
6701   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6702   ins_encode %{
6703     int vector_len = 0;
6704     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6705   %}
6706   ins_pipe( pipe_slow );
6707 %}
6708 
6709 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6710   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6711   match(Set dst (AddVD src1 src2));
6712   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6713   ins_encode %{
6714     int vector_len = 1;
6715     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6716   %}
6717   ins_pipe( pipe_slow );
6718 %}
6719 
6720 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6721   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6722   match(Set dst (AddVD src (LoadVector mem)));
6723   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6724   ins_encode %{
6725     int vector_len = 1;
6726     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6727   %}
6728   ins_pipe( pipe_slow );
6729 %}
6730 
6731 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6732   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6733   match(Set dst (AddVD src1 src2));
6734   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6735   ins_encode %{
6736     int vector_len = 2;
6737     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6738   %}
6739   ins_pipe( pipe_slow );
6740 %}
6741 
6742 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6743   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6744   match(Set dst (AddVD src (LoadVector mem)));
6745   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6746   ins_encode %{
6747     int vector_len = 2;
6748     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6749   %}
6750   ins_pipe( pipe_slow );
6751 %}
6752 
6753 // --------------------------------- SUB --------------------------------------
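     // The subtract rules mirror the add rules above: the same operand shapes and
     // predicates, with psub*/vpsub* (and subps/subpd, vsubps/vsubpd) in place of
     // the corresponding add instructions.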
6754 
6755 // Bytes vector sub
6756 instruct vsub4B(vecS dst, vecS src) %{
6757   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6758   match(Set dst (SubVB dst src));
6759   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6760   ins_encode %{
6761     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6762   %}
6763   ins_pipe( pipe_slow );
6764 %}
6765 
6766 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6767   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6768   match(Set dst (SubVB src1 src2));
6769   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6770   ins_encode %{
6771     int vector_len = 0;
6772     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6773   %}
6774   ins_pipe( pipe_slow );
6775 %}
6776 
6777 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6778   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6779   match(Set dst (SubVB src (LoadVector mem)));
6780   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6781   ins_encode %{
6782     int vector_len = 0;
6783     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6784   %}
6785   ins_pipe( pipe_slow );
6786 %}
6787 
6788 instruct vsub8B(vecD dst, vecD src) %{
6789   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6790   match(Set dst (SubVB dst src));
6791   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6792   ins_encode %{
6793     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6794   %}
6795   ins_pipe( pipe_slow );
6796 %}
6797 
6798 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6799   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6800   match(Set dst (SubVB src1 src2));
6801   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6802   ins_encode %{
6803     int vector_len = 0;
6804     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6805   %}
6806   ins_pipe( pipe_slow );
6807 %}
6808 
6809 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6810   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6811   match(Set dst (SubVB src (LoadVector mem)));
6812   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6813   ins_encode %{
6814     int vector_len = 0;
6815     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6816   %}
6817   ins_pipe( pipe_slow );
6818 %}
6819 
6820 instruct vsub16B(vecX dst, vecX src) %{
6821   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6822   match(Set dst (SubVB dst src));
6823   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6824   ins_encode %{
6825     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6826   %}
6827   ins_pipe( pipe_slow );
6828 %}
6829 
6830 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6831   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6832   match(Set dst (SubVB src1 src2));
6833   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6834   ins_encode %{
6835     int vector_len = 0;
6836     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6837   %}
6838   ins_pipe( pipe_slow );
6839 %}
6840 
6841 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6842   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6843   match(Set dst (SubVB src (LoadVector mem)));
6844   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6845   ins_encode %{
6846     int vector_len = 0;
6847     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6848   %}
6849   ins_pipe( pipe_slow );
6850 %}
6851 
6852 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6853   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6854   match(Set dst (SubVB src1 src2));
6855   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6856   ins_encode %{
6857     int vector_len = 1;
6858     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6859   %}
6860   ins_pipe( pipe_slow );
6861 %}
6862 
6863 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6864   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6865   match(Set dst (SubVB src (LoadVector mem)));
6866   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6867   ins_encode %{
6868     int vector_len = 1;
6869     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6870   %}
6871   ins_pipe( pipe_slow );
6872 %}
6873 
6874 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6875   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6876   match(Set dst (SubVB src1 src2));
6877   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6878   ins_encode %{
6879     int vector_len = 2;
6880     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6881   %}
6882   ins_pipe( pipe_slow );
6883 %}
6884 
6885 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6886   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6887   match(Set dst (SubVB src (LoadVector mem)));
6888   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6889   ins_encode %{
6890     int vector_len = 2;
6891     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6892   %}
6893   ins_pipe( pipe_slow );
6894 %}
6895 
6896 // Shorts/Chars vector sub
6897 instruct vsub2S(vecS dst, vecS src) %{
6898   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6899   match(Set dst (SubVS dst src));
6900   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6901   ins_encode %{
6902     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6903   %}
6904   ins_pipe( pipe_slow );
6905 %}
6906 
6907 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6908   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6909   match(Set dst (SubVS src1 src2));
6910   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6911   ins_encode %{
6912     int vector_len = 0;
6913     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6914   %}
6915   ins_pipe( pipe_slow );
6916 %}
6917 
6918 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6919   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6920   match(Set dst (SubVS src (LoadVector mem)));
6921   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6922   ins_encode %{
6923     int vector_len = 0;
6924     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6925   %}
6926   ins_pipe( pipe_slow );
6927 %}
6928 
6929 instruct vsub4S(vecD dst, vecD src) %{
6930   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6931   match(Set dst (SubVS dst src));
6932   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6933   ins_encode %{
6934     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6935   %}
6936   ins_pipe( pipe_slow );
6937 %}
6938 
6939 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6940   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6941   match(Set dst (SubVS src1 src2));
6942   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6943   ins_encode %{
6944     int vector_len = 0;
6945     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6946   %}
6947   ins_pipe( pipe_slow );
6948 %}
6949 
6950 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6951   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6952   match(Set dst (SubVS src (LoadVector mem)));
6953   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6954   ins_encode %{
6955     int vector_len = 0;
6956     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6957   %}
6958   ins_pipe( pipe_slow );
6959 %}
6960 
6961 instruct vsub8S(vecX dst, vecX src) %{
6962   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6963   match(Set dst (SubVS dst src));
6964   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6965   ins_encode %{
6966     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6967   %}
6968   ins_pipe( pipe_slow );
6969 %}
6970 
6971 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6972   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6973   match(Set dst (SubVS src1 src2));
6974   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6975   ins_encode %{
6976     int vector_len = 0;
6977     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6978   %}
6979   ins_pipe( pipe_slow );
6980 %}
6981 
6982 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6983   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6984   match(Set dst (SubVS src (LoadVector mem)));
6985   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6986   ins_encode %{
6987     int vector_len = 0;
6988     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6989   %}
6990   ins_pipe( pipe_slow );
6991 %}
6992 
6993 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6994   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6995   match(Set dst (SubVS src1 src2));
6996   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6997   ins_encode %{
6998     int vector_len = 1;
6999     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7000   %}
7001   ins_pipe( pipe_slow );
7002 %}
7003 
7004 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
7005   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7006   match(Set dst (SubVS src (LoadVector mem)));
7007   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7008   ins_encode %{
7009     int vector_len = 1;
7010     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7011   %}
7012   ins_pipe( pipe_slow );
7013 %}
7014 
7015 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7016   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7017   match(Set dst (SubVS src1 src2));
7018   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7019   ins_encode %{
7020     int vector_len = 2;
7021     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7022   %}
7023   ins_pipe( pipe_slow );
7024 %}
7025 
7026 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7027   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7028   match(Set dst (SubVS src (LoadVector mem)));
7029   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7030   ins_encode %{
7031     int vector_len = 2;
7032     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7033   %}
7034   ins_pipe( pipe_slow );
7035 %}
7036 
7037 // Integers vector sub
7038 instruct vsub2I(vecD dst, vecD src) %{
7039   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7040   match(Set dst (SubVI dst src));
7041   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7042   ins_encode %{
7043     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7044   %}
7045   ins_pipe( pipe_slow );
7046 %}
7047 
7048 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7049   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7050   match(Set dst (SubVI src1 src2));
7051   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7052   ins_encode %{
7053     int vector_len = 0;
7054     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7055   %}
7056   ins_pipe( pipe_slow );
7057 %}
7058 
7059 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7060   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7061   match(Set dst (SubVI src (LoadVector mem)));
7062   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7063   ins_encode %{
7064     int vector_len = 0;
7065     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7066   %}
7067   ins_pipe( pipe_slow );
7068 %}
7069 
7070 instruct vsub4I(vecX dst, vecX src) %{
7071   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7072   match(Set dst (SubVI dst src));
7073   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7074   ins_encode %{
7075     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7076   %}
7077   ins_pipe( pipe_slow );
7078 %}
7079 
7080 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7081   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7082   match(Set dst (SubVI src1 src2));
7083   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7084   ins_encode %{
7085     int vector_len = 0;
7086     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7087   %}
7088   ins_pipe( pipe_slow );
7089 %}
7090 
7091 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7092   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7093   match(Set dst (SubVI src (LoadVector mem)));
7094   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7095   ins_encode %{
7096     int vector_len = 0;
7097     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7098   %}
7099   ins_pipe( pipe_slow );
7100 %}
7101 
7102 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7103   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7104   match(Set dst (SubVI src1 src2));
7105   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7106   ins_encode %{
7107     int vector_len = 1;
7108     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7109   %}
7110   ins_pipe( pipe_slow );
7111 %}
7112 
7113 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7114   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7115   match(Set dst (SubVI src (LoadVector mem)));
7116   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7117   ins_encode %{
7118     int vector_len = 1;
7119     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7120   %}
7121   ins_pipe( pipe_slow );
7122 %}
7123 
7124 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7125   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7126   match(Set dst (SubVI src1 src2));
7127   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7128   ins_encode %{
7129     int vector_len = 2;
7130     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7131   %}
7132   ins_pipe( pipe_slow );
7133 %}
7134 
7135 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7136   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7137   match(Set dst (SubVI src (LoadVector mem)));
7138   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7139   ins_encode %{
7140     int vector_len = 2;
7141     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7142   %}
7143   ins_pipe( pipe_slow );
7144 %}
7145 
7146 // Longs vector sub
7147 instruct vsub2L(vecX dst, vecX src) %{
7148   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7149   match(Set dst (SubVL dst src));
7150   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7151   ins_encode %{
7152     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7153   %}
7154   ins_pipe( pipe_slow );
7155 %}
7156 
7157 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7158   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7159   match(Set dst (SubVL src1 src2));
7160   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7161   ins_encode %{
7162     int vector_len = 0;
7163     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7164   %}
7165   ins_pipe( pipe_slow );
7166 %}
7167 
7168 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7169   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7170   match(Set dst (SubVL src (LoadVector mem)));
7171   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7172   ins_encode %{
7173     int vector_len = 0;
7174     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7175   %}
7176   ins_pipe( pipe_slow );
7177 %}
7178 
7179 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7180   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7181   match(Set dst (SubVL src1 src2));
7182   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7183   ins_encode %{
7184     int vector_len = 1;
7185     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7186   %}
7187   ins_pipe( pipe_slow );
7188 %}
7189 
7190 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7191   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7192   match(Set dst (SubVL src (LoadVector mem)));
7193   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7194   ins_encode %{
7195     int vector_len = 1;
7196     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7197   %}
7198   ins_pipe( pipe_slow );
7199 %}
7200 
7201 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7202   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7203   match(Set dst (SubVL src1 src2));
7204   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7205   ins_encode %{
7206     int vector_len = 2;
7207     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7208   %}
7209   ins_pipe( pipe_slow );
7210 %}
7211 
7212 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7213   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7214   match(Set dst (SubVL src (LoadVector mem)));
7215   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7216   ins_encode %{
7217     int vector_len = 2;
7218     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7219   %}
7220   ins_pipe( pipe_slow );
7221 %}
7222 
7223 // Floats vector sub
7224 instruct vsub2F(vecD dst, vecD src) %{
7225   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7226   match(Set dst (SubVF dst src));
7227   format %{ "subps   $dst,$src\t! sub packed2F" %}
7228   ins_encode %{
7229     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7230   %}
7231   ins_pipe( pipe_slow );
7232 %}
7233 
7234 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7235   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7236   match(Set dst (SubVF src1 src2));
7237   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7238   ins_encode %{
7239     int vector_len = 0;
7240     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7241   %}
7242   ins_pipe( pipe_slow );
7243 %}
7244 
7245 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7246   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7247   match(Set dst (SubVF src (LoadVector mem)));
7248   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7249   ins_encode %{
7250     int vector_len = 0;
7251     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7252   %}
7253   ins_pipe( pipe_slow );
7254 %}
7255 
7256 instruct vsub4F(vecX dst, vecX src) %{
7257   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7258   match(Set dst (SubVF dst src));
7259   format %{ "subps   $dst,$src\t! sub packed4F" %}
7260   ins_encode %{
7261     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7262   %}
7263   ins_pipe( pipe_slow );
7264 %}
7265 
7266 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7267   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7268   match(Set dst (SubVF src1 src2));
7269   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7270   ins_encode %{
7271     int vector_len = 0;
7272     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7273   %}
7274   ins_pipe( pipe_slow );
7275 %}
7276 
7277 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7278   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7279   match(Set dst (SubVF src (LoadVector mem)));
7280   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7281   ins_encode %{
7282     int vector_len = 0;
7283     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7284   %}
7285   ins_pipe( pipe_slow );
7286 %}
7287 
7288 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7289   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7290   match(Set dst (SubVF src1 src2));
7291   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7292   ins_encode %{
7293     int vector_len = 1;
7294     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7295   %}
7296   ins_pipe( pipe_slow );
7297 %}
7298 
7299 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7300   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7301   match(Set dst (SubVF src (LoadVector mem)));
7302   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7303   ins_encode %{
7304     int vector_len = 1;
7305     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7306   %}
7307   ins_pipe( pipe_slow );
7308 %}
7309 
7310 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7311   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7312   match(Set dst (SubVF src1 src2));
7313   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7314   ins_encode %{
7315     int vector_len = 2;
7316     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7317   %}
7318   ins_pipe( pipe_slow );
7319 %}
7320 
7321 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7322   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7323   match(Set dst (SubVF src (LoadVector mem)));
7324   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7325   ins_encode %{
7326     int vector_len = 2;
7327     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7328   %}
7329   ins_pipe( pipe_slow );
7330 %}
7331 
7332 // Doubles vector sub
7333 instruct vsub2D(vecX dst, vecX src) %{
7334   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7335   match(Set dst (SubVD dst src));
7336   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7337   ins_encode %{
7338     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7339   %}
7340   ins_pipe( pipe_slow );
7341 %}
7342 
7343 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7344   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7345   match(Set dst (SubVD src1 src2));
7346   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7347   ins_encode %{
7348     int vector_len = 0;
7349     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7350   %}
7351   ins_pipe( pipe_slow );
7352 %}
7353 
7354 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7355   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7356   match(Set dst (SubVD src (LoadVector mem)));
7357   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7358   ins_encode %{
7359     int vector_len = 0;
7360     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7361   %}
7362   ins_pipe( pipe_slow );
7363 %}
7364 
7365 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7366   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7367   match(Set dst (SubVD src1 src2));
7368   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7369   ins_encode %{
7370     int vector_len = 1;
7371     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7372   %}
7373   ins_pipe( pipe_slow );
7374 %}
7375 
7376 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7377   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7378   match(Set dst (SubVD src (LoadVector mem)));
7379   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7380   ins_encode %{
7381     int vector_len = 1;
7382     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7383   %}
7384   ins_pipe( pipe_slow );
7385 %}
7386 
7387 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7388   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7389   match(Set dst (SubVD src1 src2));
7390   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7391   ins_encode %{
7392     int vector_len = 2;
7393     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7394   %}
7395   ins_pipe( pipe_slow );
7396 %}
7397 
7398 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7399   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7400   match(Set dst (SubVD src (LoadVector mem)));
7401   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7402   ins_encode %{
7403     int vector_len = 2;
7404     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7405   %}
7406   ins_pipe( pipe_slow );
7407 %}
7408 
7409 // --------------------------------- MUL --------------------------------------
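     // Packed multiply support is less uniform than add/sub: word lanes use
     // pmullw/vpmullw, dword lanes need SSE4.1 pmulld or AVX vpmulld, qword lanes
     // need the AVX-512DQ vpmullq, and the FP forms use mulps/mulpd and their
     // VEX/EVEX counterparts. x86 has no packed byte multiply, so there are no
     // MulVB rules.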
7410 
7411 // Shorts/Chars vector mul
7412 instruct vmul2S(vecS dst, vecS src) %{
7413   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7414   match(Set dst (MulVS dst src));
7415   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7416   ins_encode %{
7417     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7418   %}
7419   ins_pipe( pipe_slow );
7420 %}
7421 
7422 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7423   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7424   match(Set dst (MulVS src1 src2));
7425   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7426   ins_encode %{
7427     int vector_len = 0;
7428     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7429   %}
7430   ins_pipe( pipe_slow );
7431 %}
7432 
7433 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7434   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7435   match(Set dst (MulVS src (LoadVector mem)));
7436   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7437   ins_encode %{
7438     int vector_len = 0;
7439     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7440   %}
7441   ins_pipe( pipe_slow );
7442 %}
7443 
7444 instruct vmul4S(vecD dst, vecD src) %{
7445   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7446   match(Set dst (MulVS dst src));
7447   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7448   ins_encode %{
7449     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7450   %}
7451   ins_pipe( pipe_slow );
7452 %}
7453 
7454 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7455   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7456   match(Set dst (MulVS src1 src2));
7457   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7458   ins_encode %{
7459     int vector_len = 0;
7460     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7461   %}
7462   ins_pipe( pipe_slow );
7463 %}
7464 
7465 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7466   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7467   match(Set dst (MulVS src (LoadVector mem)));
7468   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7469   ins_encode %{
7470     int vector_len = 0;
7471     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7472   %}
7473   ins_pipe( pipe_slow );
7474 %}
7475 
7476 instruct vmul8S(vecX dst, vecX src) %{
7477   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7478   match(Set dst (MulVS dst src));
7479   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7480   ins_encode %{
7481     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7482   %}
7483   ins_pipe( pipe_slow );
7484 %}
7485 
7486 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7487   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7488   match(Set dst (MulVS src1 src2));
7489   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7490   ins_encode %{
7491     int vector_len = 0;
7492     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7493   %}
7494   ins_pipe( pipe_slow );
7495 %}
7496 
7497 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7498   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7499   match(Set dst (MulVS src (LoadVector mem)));
7500   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7501   ins_encode %{
7502     int vector_len = 0;
7503     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7504   %}
7505   ins_pipe( pipe_slow );
7506 %}
7507 
7508 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7509   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7510   match(Set dst (MulVS src1 src2));
7511   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7512   ins_encode %{
7513     int vector_len = 1;
7514     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7515   %}
7516   ins_pipe( pipe_slow );
7517 %}
7518 
7519 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7520   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7521   match(Set dst (MulVS src (LoadVector mem)));
7522   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7523   ins_encode %{
7524     int vector_len = 1;
7525     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7526   %}
7527   ins_pipe( pipe_slow );
7528 %}
7529 
7530 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7531   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7532   match(Set dst (MulVS src1 src2));
7533   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7534   ins_encode %{
7535     int vector_len = 2;
7536     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7537   %}
7538   ins_pipe( pipe_slow );
7539 %}
7540 
7541 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7542   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7543   match(Set dst (MulVS src (LoadVector mem)));
7544   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7545   ins_encode %{
7546     int vector_len = 2;
7547     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7548   %}
7549   ins_pipe( pipe_slow );
7550 %}
7551 
7552 // Integers vector mul (sse4_1)
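     // pmulld is an SSE4.1 instruction, so the non-AVX forms here are gated on
     // UseSSE > 3 instead of the UseAVX == 0 predicate used elsewhere.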
7553 instruct vmul2I(vecD dst, vecD src) %{
7554   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7555   match(Set dst (MulVI dst src));
7556   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7557   ins_encode %{
7558     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7559   %}
7560   ins_pipe( pipe_slow );
7561 %}
7562 
7563 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7564   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7565   match(Set dst (MulVI src1 src2));
7566   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7567   ins_encode %{
7568     int vector_len = 0;
7569     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7570   %}
7571   ins_pipe( pipe_slow );
7572 %}
7573 
7574 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7575   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7576   match(Set dst (MulVI src (LoadVector mem)));
7577   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7578   ins_encode %{
7579     int vector_len = 0;
7580     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7581   %}
7582   ins_pipe( pipe_slow );
7583 %}
7584 
7585 instruct vmul4I(vecX dst, vecX src) %{
7586   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7587   match(Set dst (MulVI dst src));
7588   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7589   ins_encode %{
7590     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7591   %}
7592   ins_pipe( pipe_slow );
7593 %}
7594 
7595 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7596   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7597   match(Set dst (MulVI src1 src2));
7598   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7599   ins_encode %{
7600     int vector_len = 0;
7601     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7602   %}
7603   ins_pipe( pipe_slow );
7604 %}
7605 
7606 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7607   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7608   match(Set dst (MulVI src (LoadVector mem)));
7609   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7610   ins_encode %{
7611     int vector_len = 0;
7612     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7613   %}
7614   ins_pipe( pipe_slow );
7615 %}
7616 
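     // Long (64-bit) lane multiplies: vpmullq is an AVX-512DQ instruction, so each
     // of these forms additionally requires VM_Version::supports_avx512dq().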
7617 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7618   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7619   match(Set dst (MulVL src1 src2));
7620   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7621   ins_encode %{
7622     int vector_len = 0;
7623     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7624   %}
7625   ins_pipe( pipe_slow );
7626 %}
7627 
7628 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7629   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7630   match(Set dst (MulVL src (LoadVector mem)));
7631   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7632   ins_encode %{
7633     int vector_len = 0;
7634     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7635   %}
7636   ins_pipe( pipe_slow );
7637 %}
7638 
7639 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7640   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7641   match(Set dst (MulVL src1 src2));
7642   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7643   ins_encode %{
7644     int vector_len = 1;
7645     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7646   %}
7647   ins_pipe( pipe_slow );
7648 %}
7649 
7650 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7651   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7652   match(Set dst (MulVL src (LoadVector mem)));
7653   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7654   ins_encode %{
7655     int vector_len = 1;
7656     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7657   %}
7658   ins_pipe( pipe_slow );
7659 %}
7660 
7661 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7662   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7663   match(Set dst (MulVL src1 src2));
7664   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7665   ins_encode %{
7666     int vector_len = 2;
7667     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7668   %}
7669   ins_pipe( pipe_slow );
7670 %}
7671 
7672 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7673   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7674   match(Set dst (MulVL src (LoadVector mem)));
7675   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7676   ins_encode %{
7677     int vector_len = 2;
7678     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7679   %}
7680   ins_pipe( pipe_slow );
7681 %}
7682 
7683 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7684   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7685   match(Set dst (MulVI src1 src2));
7686   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7687   ins_encode %{
7688     int vector_len = 1;
7689     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7690   %}
7691   ins_pipe( pipe_slow );
7692 %}
7693 
7694 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7695   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7696   match(Set dst (MulVI src (LoadVector mem)));
7697   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7698   ins_encode %{
7699     int vector_len = 1;
7700     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7701   %}
7702   ins_pipe( pipe_slow );
7703 %}
7704 
7705 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7706   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7707   match(Set dst (MulVI src1 src2));
7708   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7709   ins_encode %{
7710     int vector_len = 2;
7711     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7712   %}
7713   ins_pipe( pipe_slow );
7714 %}
7715 
7716 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7717   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7718   match(Set dst (MulVI src (LoadVector mem)));
7719   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7720   ins_encode %{
7721     int vector_len = 2;
7722     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7723   %}
7724   ins_pipe( pipe_slow );
7725 %}
7726 
7727 // Floats vector mul
7728 instruct vmul2F(vecD dst, vecD src) %{
7729   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7730   match(Set dst (MulVF dst src));
7731   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7732   ins_encode %{
7733     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7734   %}
7735   ins_pipe( pipe_slow );
7736 %}
7737 
7738 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7739   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7740   match(Set dst (MulVF src1 src2));
7741   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7742   ins_encode %{
7743     int vector_len = 0;
7744     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7745   %}
7746   ins_pipe( pipe_slow );
7747 %}
7748 
7749 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7750   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7751   match(Set dst (MulVF src (LoadVector mem)));
7752   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7753   ins_encode %{
7754     int vector_len = 0;
7755     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7756   %}
7757   ins_pipe( pipe_slow );
7758 %}
7759 
7760 instruct vmul4F(vecX dst, vecX src) %{
7761   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7762   match(Set dst (MulVF dst src));
7763   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7764   ins_encode %{
7765     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7766   %}
7767   ins_pipe( pipe_slow );
7768 %}
7769 
7770 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7771   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7772   match(Set dst (MulVF src1 src2));
7773   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7774   ins_encode %{
7775     int vector_len = 0;
7776     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7777   %}
7778   ins_pipe( pipe_slow );
7779 %}
7780 
7781 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7782   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7783   match(Set dst (MulVF src (LoadVector mem)));
7784   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7785   ins_encode %{
7786     int vector_len = 0;
7787     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7788   %}
7789   ins_pipe( pipe_slow );
7790 %}
7791 
7792 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7793   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7794   match(Set dst (MulVF src1 src2));
7795   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7796   ins_encode %{
7797     int vector_len = 1;
7798     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7799   %}
7800   ins_pipe( pipe_slow );
7801 %}
7802 
7803 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7804   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7805   match(Set dst (MulVF src (LoadVector mem)));
7806   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7807   ins_encode %{
7808     int vector_len = 1;
7809     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7810   %}
7811   ins_pipe( pipe_slow );
7812 %}
7813 
7814 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7815   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7816   match(Set dst (MulVF src1 src2));
7817   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7818   ins_encode %{
7819     int vector_len = 2;
7820     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7821   %}
7822   ins_pipe( pipe_slow );
7823 %}
7824 
7825 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7826   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7827   match(Set dst (MulVF src (LoadVector mem)));
7828   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7829   ins_encode %{
7830     int vector_len = 2;
7831     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7832   %}
7833   ins_pipe( pipe_slow );
7834 %}
7835 
7836 // Doubles vector mul
7837 instruct vmul2D(vecX dst, vecX src) %{
7838   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7839   match(Set dst (MulVD dst src));
7840   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7841   ins_encode %{
7842     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7843   %}
7844   ins_pipe( pipe_slow );
7845 %}
7846 
7847 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7848   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7849   match(Set dst (MulVD src1 src2));
7850   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7851   ins_encode %{
7852     int vector_len = 0;
7853     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7854   %}
7855   ins_pipe( pipe_slow );
7856 %}
7857 
7858 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7859   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7860   match(Set dst (MulVD src (LoadVector mem)));
7861   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7862   ins_encode %{
7863     int vector_len = 0;
7864     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7865   %}
7866   ins_pipe( pipe_slow );
7867 %}
7868 
7869 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7870   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7871   match(Set dst (MulVD src1 src2));
7872   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7873   ins_encode %{
7874     int vector_len = 1;
7875     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7876   %}
7877   ins_pipe( pipe_slow );
7878 %}
7879 
7880 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7881   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7882   match(Set dst (MulVD src (LoadVector mem)));
7883   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7884   ins_encode %{
7885     int vector_len = 1;
7886     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7887   %}
7888   ins_pipe( pipe_slow );
7889 %}
7890 
7891 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7892   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7893   match(Set dst (MulVD src1 src2));
7894   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
7895   ins_encode %{
7896     int vector_len = 2;
7897     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7898   %}
7899   ins_pipe( pipe_slow );
7900 %}
7901 
7902 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7903   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7904   match(Set dst (MulVD src (LoadVector mem)));
7905   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
7906   ins_encode %{
7907     int vector_len = 2;
7908     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7909   %}
7910   ins_pipe( pipe_slow );
7911 %}
7912 
7913 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7914   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7915   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7916   effect(TEMP dst, USE src1, USE src2);
7917   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7918             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7919          %}
7920   ins_encode %{
7921     int vector_len = 1;
7922     int cond = (Assembler::Condition)($copnd$$cmpcode);
7923     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7924     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7925   %}
7926   ins_pipe( pipe_slow );
7927 %}
7928 
7929 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7930   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7931   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7932   effect(TEMP dst, USE src1, USE src2);
7933   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7934             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7935          %}
7936   ins_encode %{
7937     int vector_len = 1;
7938     int cond = (Assembler::Condition)($copnd$$cmpcode);
7939     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7940     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7941   %}
7942   ins_pipe( pipe_slow );
7943 %}
7944 
7945 // --------------------------------- DIV --------------------------------------
7946 
7947 // Floats vector div
7948 instruct vdiv2F(vecD dst, vecD src) %{
7949   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7950   match(Set dst (DivVF dst src));
7951   format %{ "divps   $dst,$src\t! div packed2F" %}
7952   ins_encode %{
7953     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7954   %}
7955   ins_pipe( pipe_slow );
7956 %}
7957 
7958 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7959   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7960   match(Set dst (DivVF src1 src2));
7961   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
7962   ins_encode %{
7963     int vector_len = 0;
7964     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7965   %}
7966   ins_pipe( pipe_slow );
7967 %}
7968 
7969 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
7970   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7971   match(Set dst (DivVF src (LoadVector mem)));
7972   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
7973   ins_encode %{
7974     int vector_len = 0;
7975     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7976   %}
7977   ins_pipe( pipe_slow );
7978 %}
7979 
7980 instruct vdiv4F(vecX dst, vecX src) %{
7981   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7982   match(Set dst (DivVF dst src));
7983   format %{ "divps   $dst,$src\t! div packed4F" %}
7984   ins_encode %{
7985     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7986   %}
7987   ins_pipe( pipe_slow );
7988 %}
7989 
7990 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
7991   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7992   match(Set dst (DivVF src1 src2));
7993   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
7994   ins_encode %{
7995     int vector_len = 0;
7996     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7997   %}
7998   ins_pipe( pipe_slow );
7999 %}
8000 
8001 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8002   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8003   match(Set dst (DivVF src (LoadVector mem)));
8004   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8005   ins_encode %{
8006     int vector_len = 0;
8007     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8008   %}
8009   ins_pipe( pipe_slow );
8010 %}
8011 
8012 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8013   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8014   match(Set dst (DivVF src1 src2));
8015   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8016   ins_encode %{
8017     int vector_len = 1;
8018     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8019   %}
8020   ins_pipe( pipe_slow );
8021 %}
8022 
8023 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8024   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8025   match(Set dst (DivVF src (LoadVector mem)));
8026   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8027   ins_encode %{
8028     int vector_len = 1;
8029     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8030   %}
8031   ins_pipe( pipe_slow );
8032 %}
8033 
8034 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8035   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8036   match(Set dst (DivVF src1 src2));
8037   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8038   ins_encode %{
8039     int vector_len = 2;
8040     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8041   %}
8042   ins_pipe( pipe_slow );
8043 %}
8044 
8045 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8046   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8047   match(Set dst (DivVF src (LoadVector mem)));
8048   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8049   ins_encode %{
8050     int vector_len = 2;
8051     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8052   %}
8053   ins_pipe( pipe_slow );
8054 %}
8055 
8056 // Doubles vector div
8057 instruct vdiv2D(vecX dst, vecX src) %{
8058   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8059   match(Set dst (DivVD dst src));
8060   format %{ "divpd   $dst,$src\t! div packed2D" %}
8061   ins_encode %{
8062     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8063   %}
8064   ins_pipe( pipe_slow );
8065 %}
8066 
8067 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8068   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8069   match(Set dst (DivVD src1 src2));
8070   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8071   ins_encode %{
8072     int vector_len = 0;
8073     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8074   %}
8075   ins_pipe( pipe_slow );
8076 %}
8077 
8078 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8079   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8080   match(Set dst (DivVD src (LoadVector mem)));
8081   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8082   ins_encode %{
8083     int vector_len = 0;
8084     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8085   %}
8086   ins_pipe( pipe_slow );
8087 %}
8088 
8089 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8090   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8091   match(Set dst (DivVD src1 src2));
8092   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8093   ins_encode %{
8094     int vector_len = 1;
8095     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8096   %}
8097   ins_pipe( pipe_slow );
8098 %}
8099 
8100 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8101   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8102   match(Set dst (DivVD src (LoadVector mem)));
8103   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8104   ins_encode %{
8105     int vector_len = 1;
8106     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8107   %}
8108   ins_pipe( pipe_slow );
8109 %}
8110 
8111 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8112   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8113   match(Set dst (DivVD src1 src2));
8114   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8115   ins_encode %{
8116     int vector_len = 2;
8117     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8118   %}
8119   ins_pipe( pipe_slow );
8120 %}
8121 
8122 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8123   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8124   match(Set dst (DivVD src (LoadVector mem)));
8125   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8126   ins_encode %{
8127     int vector_len = 2;
8128     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8129   %}
8130   ins_pipe( pipe_slow );
8131 %}
8132 
8133 // ------------------------------ Shift ---------------------------------------
8134 
8135 // Left and right shift count vectors are the same on x86
8136 // (only lowest bits of xmm reg are used for count).
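//
// As an illustration (a minimal standalone C++/SSE2 sketch, not part of this
// file; names and values are only examples): the scalar count is moved into
// the low bits of an xmm register once, and that same register then feeds
// both the left and the right packed shifts, which is exactly what the
// vshiftcnt instruct below sets up with a single movdl.
//
//   #include <emmintrin.h>   // SSE2 intrinsics
//   #include <cstdio>
//
//   int main() {
//     int cnt = 3;
//     // Load the scalar shift count into the low 32 bits of an xmm register
//     // (the counterpart of the movdl emitted by vshiftcnt).
//     __m128i count = _mm_cvtsi32_si128(cnt);
//     __m128i v     = _mm_set1_epi32(-16);         // 0xFFFFFFF0 in every lane
//     // Both shift directions consume the same count register; the hardware
//     // only looks at its low bits.
//     __m128i left  = _mm_sll_epi32(v, count);     // 0xFFFFFF80 per lane
//     __m128i right = _mm_srl_epi32(v, count);     // 0x1FFFFFFE per lane
//     printf("%08x %08x\n",
//            (unsigned)_mm_cvtsi128_si32(left),
//            (unsigned)_mm_cvtsi128_si32(right));
//     return 0;
//   }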
8137 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8138   match(Set dst (LShiftCntV cnt));
8139   match(Set dst (RShiftCntV cnt));
8140   format %{ "movd    $dst,$cnt\t! load shift count" %}
8141   ins_encode %{
8142     __ movdl($dst$$XMMRegister, $cnt$$Register);
8143   %}
8144   ins_pipe( pipe_slow );
8145 %}
8146 
8147 // --------------------------------- Sqrt --------------------------------------
8148 
8149 // Floating point vector sqrt
8150 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8151   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8152   match(Set dst (SqrtVD src));
8153   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8154   ins_encode %{
8155     int vector_len = 0;
8156     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8157   %}
8158   ins_pipe( pipe_slow );
8159 %}
8160 
8161 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8162   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8163   match(Set dst (SqrtVD (LoadVector mem)));
8164   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8165   ins_encode %{
8166     int vector_len = 0;
8167     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8168   %}
8169   ins_pipe( pipe_slow );
8170 %}
8171 
8172 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8173   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8174   match(Set dst (SqrtVD src));
8175   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8176   ins_encode %{
8177     int vector_len = 1;
8178     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8179   %}
8180   ins_pipe( pipe_slow );
8181 %}
8182 
8183 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8184   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8185   match(Set dst (SqrtVD (LoadVector mem)));
8186   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8187   ins_encode %{
8188     int vector_len = 1;
8189     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8190   %}
8191   ins_pipe( pipe_slow );
8192 %}
8193 
8194 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8195   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8196   match(Set dst (SqrtVD src));
8197   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8198   ins_encode %{
8199     int vector_len = 2;
8200     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8201   %}
8202   ins_pipe( pipe_slow );
8203 %}
8204 
8205 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8206   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8207   match(Set dst (SqrtVD (LoadVector mem)));
8208   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8209   ins_encode %{
8210     int vector_len = 2;
8211     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8212   %}
8213   ins_pipe( pipe_slow );
8214 %}
8215 
8216 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8217   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8218   match(Set dst (SqrtVF src));
8219   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8220   ins_encode %{
8221     int vector_len = 0;
8222     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8223   %}
8224   ins_pipe( pipe_slow );
8225 %}
8226 
8227 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8228   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8229   match(Set dst (SqrtVF (LoadVector mem)));
8230   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8231   ins_encode %{
8232     int vector_len = 0;
8233     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8234   %}
8235   ins_pipe( pipe_slow );
8236 %}
8237 
8238 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8239   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8240   match(Set dst (SqrtVF src));
8241   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8242   ins_encode %{
8243     int vector_len = 0;
8244     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8245   %}
8246   ins_pipe( pipe_slow );
8247 %}
8248 
8249 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8250   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8251   match(Set dst (SqrtVF (LoadVector mem)));
8252   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8253   ins_encode %{
8254     int vector_len = 0;
8255     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8256   %}
8257   ins_pipe( pipe_slow );
8258 %}
8259 
8260 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8261   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8262   match(Set dst (SqrtVF src));
8263   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8264   ins_encode %{
8265     int vector_len = 1;
8266     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8267   %}
8268   ins_pipe( pipe_slow );
8269 %}
8270 
8271 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8272   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8273   match(Set dst (SqrtVF (LoadVector mem)));
8274   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8275   ins_encode %{
8276     int vector_len = 1;
8277     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8278   %}
8279   ins_pipe( pipe_slow );
8280 %}
8281 
8282 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8283   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8284   match(Set dst (SqrtVF src));
8285   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8286   ins_encode %{
8287     int vector_len = 2;
8288     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8289   %}
8290   ins_pipe( pipe_slow );
8291 %}
8292 
8293 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8294   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8295   match(Set dst (SqrtVF (LoadVector mem)));
8296   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8297   ins_encode %{
8298     int vector_len = 2;
8299     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8300   %}
8301   ins_pipe( pipe_slow );
8302 %}
8303 
8304 // ------------------------------ LeftShift -----------------------------------
8305 
8306 // Shorts/Chars vector left shift
8307 instruct vsll2S(vecS dst, vecS shift) %{
8308   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8309   match(Set dst (LShiftVS dst shift));
8310   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8311   ins_encode %{
8312     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8313   %}
8314   ins_pipe( pipe_slow );
8315 %}
8316 
8317 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8318   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8319   match(Set dst (LShiftVS dst shift));
8320   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8321   ins_encode %{
8322     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8323   %}
8324   ins_pipe( pipe_slow );
8325 %}
8326 
8327 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
8328   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8329   match(Set dst (LShiftVS src shift));
8330   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8331   ins_encode %{
8332     int vector_len = 0;
8333     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8334   %}
8335   ins_pipe( pipe_slow );
8336 %}
8337 
8338 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8339   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8340   match(Set dst (LShiftVS src shift));
8341   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8342   ins_encode %{
8343     int vector_len = 0;
8344     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8345   %}
8346   ins_pipe( pipe_slow );
8347 %}
8348 
8349 instruct vsll4S(vecD dst, vecS shift) %{
8350   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8351   match(Set dst (LShiftVS dst shift));
8352   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8353   ins_encode %{
8354     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8355   %}
8356   ins_pipe( pipe_slow );
8357 %}
8358 
8359 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8360   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8361   match(Set dst (LShiftVS dst shift));
8362   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8363   ins_encode %{
8364     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8365   %}
8366   ins_pipe( pipe_slow );
8367 %}
8368 
8369 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
8370   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8371   match(Set dst (LShiftVS src shift));
8372   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8373   ins_encode %{
8374     int vector_len = 0;
8375     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8376   %}
8377   ins_pipe( pipe_slow );
8378 %}
8379 
8380 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8381   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8382   match(Set dst (LShiftVS src shift));
8383   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8384   ins_encode %{
8385     int vector_len = 0;
8386     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8387   %}
8388   ins_pipe( pipe_slow );
8389 %}
8390 
8391 instruct vsll8S(vecX dst, vecS shift) %{
8392   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8393   match(Set dst (LShiftVS dst shift));
8394   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8395   ins_encode %{
8396     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8397   %}
8398   ins_pipe( pipe_slow );
8399 %}
8400 
8401 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8402   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8403   match(Set dst (LShiftVS dst shift));
8404   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8405   ins_encode %{
8406     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8407   %}
8408   ins_pipe( pipe_slow );
8409 %}
8410 
8411 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
8412   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8413   match(Set dst (LShiftVS src shift));
8414   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8415   ins_encode %{
8416     int vector_len = 0;
8417     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8418   %}
8419   ins_pipe( pipe_slow );
8420 %}
8421 
8422 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8423   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8424   match(Set dst (LShiftVS src shift));
8425   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8426   ins_encode %{
8427     int vector_len = 0;
8428     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8429   %}
8430   ins_pipe( pipe_slow );
8431 %}
8432 
8433 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
8434   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8435   match(Set dst (LShiftVS src shift));
8436   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8437   ins_encode %{
8438     int vector_len = 1;
8439     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8440   %}
8441   ins_pipe( pipe_slow );
8442 %}
8443 
8444 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8445   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8446   match(Set dst (LShiftVS src shift));
8447   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8448   ins_encode %{
8449     int vector_len = 1;
8450     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8451   %}
8452   ins_pipe( pipe_slow );
8453 %}
8454 
8455 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8456   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8457   match(Set dst (LShiftVS src shift));
8458   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8459   ins_encode %{
8460     int vector_len = 2;
8461     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8462   %}
8463   ins_pipe( pipe_slow );
8464 %}
8465 
8466 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8467   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8468   match(Set dst (LShiftVS src shift));
8469   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8470   ins_encode %{
8471     int vector_len = 2;
8472     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8473   %}
8474   ins_pipe( pipe_slow );
8475 %}
8476 
8477 // Integers vector left shift
8478 instruct vsll2I(vecD dst, vecS shift) %{
8479   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8480   match(Set dst (LShiftVI dst shift));
8481   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8482   ins_encode %{
8483     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8484   %}
8485   ins_pipe( pipe_slow );
8486 %}
8487 
8488 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8489   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8490   match(Set dst (LShiftVI dst shift));
8491   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8492   ins_encode %{
8493     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8494   %}
8495   ins_pipe( pipe_slow );
8496 %}
8497 
8498 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8499   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8500   match(Set dst (LShiftVI src shift));
8501   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8502   ins_encode %{
8503     int vector_len = 0;
8504     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8505   %}
8506   ins_pipe( pipe_slow );
8507 %}
8508 
8509 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8510   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8511   match(Set dst (LShiftVI src shift));
8512   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8513   ins_encode %{
8514     int vector_len = 0;
8515     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8516   %}
8517   ins_pipe( pipe_slow );
8518 %}
8519 
8520 instruct vsll4I(vecX dst, vecS shift) %{
8521   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8522   match(Set dst (LShiftVI dst shift));
8523   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8524   ins_encode %{
8525     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8526   %}
8527   ins_pipe( pipe_slow );
8528 %}
8529 
8530 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8531   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8532   match(Set dst (LShiftVI dst shift));
8533   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8534   ins_encode %{
8535     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8536   %}
8537   ins_pipe( pipe_slow );
8538 %}
8539 
8540 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8541   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8542   match(Set dst (LShiftVI src shift));
8543   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8544   ins_encode %{
8545     int vector_len = 0;
8546     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8547   %}
8548   ins_pipe( pipe_slow );
8549 %}
8550 
8551 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8552   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8553   match(Set dst (LShiftVI src shift));
8554   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8555   ins_encode %{
8556     int vector_len = 0;
8557     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8558   %}
8559   ins_pipe( pipe_slow );
8560 %}
8561 
8562 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
8563   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8564   match(Set dst (LShiftVI src shift));
8565   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8566   ins_encode %{
8567     int vector_len = 1;
8568     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8569   %}
8570   ins_pipe( pipe_slow );
8571 %}
8572 
8573 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8574   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8575   match(Set dst (LShiftVI src shift));
8576   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8577   ins_encode %{
8578     int vector_len = 1;
8579     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8580   %}
8581   ins_pipe( pipe_slow );
8582 %}
8583 
8584 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
8585   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8586   match(Set dst (LShiftVI src shift));
8587   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8588   ins_encode %{
8589     int vector_len = 2;
8590     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8591   %}
8592   ins_pipe( pipe_slow );
8593 %}
8594 
8595 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8596   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8597   match(Set dst (LShiftVI src shift));
8598   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8599   ins_encode %{
8600     int vector_len = 2;
8601     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8602   %}
8603   ins_pipe( pipe_slow );
8604 %}
8605 
8606 // Longs vector left shift
8607 instruct vsll2L(vecX dst, vecS shift) %{
8608   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8609   match(Set dst (LShiftVL dst shift));
8610   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8611   ins_encode %{
8612     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
8613   %}
8614   ins_pipe( pipe_slow );
8615 %}
8616 
8617 instruct vsll2L_imm(vecX dst, immI8 shift) %{
8618   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8619   match(Set dst (LShiftVL dst shift));
8620   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8621   ins_encode %{
8622     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
8623   %}
8624   ins_pipe( pipe_slow );
8625 %}
8626 
8627 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
8628   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8629   match(Set dst (LShiftVL src shift));
8630   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8631   ins_encode %{
8632     int vector_len = 0;
8633     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8634   %}
8635   ins_pipe( pipe_slow );
8636 %}
8637 
8638 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8639   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8640   match(Set dst (LShiftVL src shift));
8641   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8642   ins_encode %{
8643     int vector_len = 0;
8644     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8645   %}
8646   ins_pipe( pipe_slow );
8647 %}
8648 
8649 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
8650   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8651   match(Set dst (LShiftVL src shift));
8652   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8653   ins_encode %{
8654     int vector_len = 1;
8655     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8656   %}
8657   ins_pipe( pipe_slow );
8658 %}
8659 
8660 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8661   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8662   match(Set dst (LShiftVL src shift));
8663   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8664   ins_encode %{
8665     int vector_len = 1;
8666     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8667   %}
8668   ins_pipe( pipe_slow );
8669 %}
8670 
8671 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8672   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8673   match(Set dst (LShiftVL src shift));
8674   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8675   ins_encode %{
8676     int vector_len = 2;
8677     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8678   %}
8679   ins_pipe( pipe_slow );
8680 %}
8681 
8682 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8683   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8684   match(Set dst (LShiftVL src shift));
8685   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8686   ins_encode %{
8687     int vector_len = 2;
8688     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8689   %}
8690   ins_pipe( pipe_slow );
8691 %}
8692 
8693 // ----------------------- LogicalRightShift -----------------------------------
8694 
8695 // Shorts vector logical right shift produces an incorrect Java result
8696 // for negative data because Java code converts a short value into an int with
8697 // sign extension before the shift. Char vectors are fine, though, since chars
8698 // are unsigned values.
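//
// A minimal standalone C++ sketch (not part of this file; the variable names
// are only illustrative) walking through the arithmetic behind the comment
// above for one negative short:
//
//   #include <cstdint>
//   #include <cstdio>
//
//   int main() {
//     int16_t s = -4;                                            // 0xFFFC
//     // Java semantics: promote to int with sign extension, shift with zero
//     // fill, then truncate the result back to 16 bits.
//     int16_t java_like = (int16_t)((uint32_t)(int32_t)s >> 1);  // 0xFFFE = -2
//     // What a packed 16-bit logical shift (psrlw) does: shift the 16-bit
//     // lane directly, filling with zeroes.
//     int16_t packed16  = (int16_t)((uint16_t)s >> 1);           // 0x7FFE = 32766
//     printf("java-like: %d  packed16: %d\n", java_like, packed16);
//     // For char data (zero-extended, unsigned 16-bit) the two computations
//     // agree, which is why char vectors can still use these instructs.
//     return 0;
//   }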
8699 
8700 instruct vsrl2S(vecS dst, vecS shift) %{
8701   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8702   match(Set dst (URShiftVS dst shift));
8703   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8704   ins_encode %{
8705     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8706   %}
8707   ins_pipe( pipe_slow );
8708 %}
8709 
8710 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
8711   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8712   match(Set dst (URShiftVS dst shift));
8713   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8714   ins_encode %{
8715     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8716   %}
8717   ins_pipe( pipe_slow );
8718 %}
8719 
8720 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
8721   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8722   match(Set dst (URShiftVS src shift));
8723   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8724   ins_encode %{
8725     int vector_len = 0;
8726     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8727   %}
8728   ins_pipe( pipe_slow );
8729 %}
8730 
8731 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8732   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8733   match(Set dst (URShiftVS src shift));
8734   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8735   ins_encode %{
8736     int vector_len = 0;
8737     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8738   %}
8739   ins_pipe( pipe_slow );
8740 %}
8741 
8742 instruct vsrl4S(vecD dst, vecS shift) %{
8743   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8744   match(Set dst (URShiftVS dst shift));
8745   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8746   ins_encode %{
8747     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8748   %}
8749   ins_pipe( pipe_slow );
8750 %}
8751 
8752 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
8753   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8754   match(Set dst (URShiftVS dst shift));
8755   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8756   ins_encode %{
8757     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8758   %}
8759   ins_pipe( pipe_slow );
8760 %}
8761 
8762 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
8763   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8764   match(Set dst (URShiftVS src shift));
8765   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8766   ins_encode %{
8767     int vector_len = 0;
8768     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8769   %}
8770   ins_pipe( pipe_slow );
8771 %}
8772 
8773 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8774   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8775   match(Set dst (URShiftVS src shift));
8776   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8777   ins_encode %{
8778     int vector_len = 0;
8779     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8780   %}
8781   ins_pipe( pipe_slow );
8782 %}
8783 
8784 instruct vsrl8S(vecX dst, vecS shift) %{
8785   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8786   match(Set dst (URShiftVS dst shift));
8787   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8788   ins_encode %{
8789     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8790   %}
8791   ins_pipe( pipe_slow );
8792 %}
8793 
8794 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
8795   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8796   match(Set dst (URShiftVS dst shift));
8797   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8798   ins_encode %{
8799     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8800   %}
8801   ins_pipe( pipe_slow );
8802 %}
8803 
8804 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
8805   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8806   match(Set dst (URShiftVS src shift));
8807   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8808   ins_encode %{
8809     int vector_len = 0;
8810     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8811   %}
8812   ins_pipe( pipe_slow );
8813 %}
8814 
8815 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8816   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8817   match(Set dst (URShiftVS src shift));
8818   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8819   ins_encode %{
8820     int vector_len = 0;
8821     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8822   %}
8823   ins_pipe( pipe_slow );
8824 %}
8825 
8826 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
8827   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8828   match(Set dst (URShiftVS src shift));
8829   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8830   ins_encode %{
8831     int vector_len = 1;
8832     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8833   %}
8834   ins_pipe( pipe_slow );
8835 %}
8836 
8837 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8838   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8839   match(Set dst (URShiftVS src shift));
8840   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8841   ins_encode %{
8842     int vector_len = 1;
8843     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8844   %}
8845   ins_pipe( pipe_slow );
8846 %}
8847 
8848 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
8849   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8850   match(Set dst (URShiftVS src shift));
8851   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8852   ins_encode %{
8853     int vector_len = 2;
8854     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8855   %}
8856   ins_pipe( pipe_slow );
8857 %}
8858 
8859 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8860   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8861   match(Set dst (URShiftVS src shift));
8862   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8863   ins_encode %{
8864     int vector_len = 2;
8865     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8866   %}
8867   ins_pipe( pipe_slow );
8868 %}
8869 
8870 // Integers vector logical right shift
8871 instruct vsrl2I(vecD dst, vecS shift) %{
8872   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8873   match(Set dst (URShiftVI dst shift));
8874   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8875   ins_encode %{
8876     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8877   %}
8878   ins_pipe( pipe_slow );
8879 %}
8880 
8881 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
8882   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8883   match(Set dst (URShiftVI dst shift));
8884   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8885   ins_encode %{
8886     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8887   %}
8888   ins_pipe( pipe_slow );
8889 %}
8890 
8891 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
8892   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8893   match(Set dst (URShiftVI src shift));
8894   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8895   ins_encode %{
8896     int vector_len = 0;
8897     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8898   %}
8899   ins_pipe( pipe_slow );
8900 %}
8901 
8902 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8903   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8904   match(Set dst (URShiftVI src shift));
8905   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8906   ins_encode %{
8907     int vector_len = 0;
8908     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8909   %}
8910   ins_pipe( pipe_slow );
8911 %}
8912 
8913 instruct vsrl4I(vecX dst, vecS shift) %{
8914   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8915   match(Set dst (URShiftVI dst shift));
8916   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8917   ins_encode %{
8918     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8919   %}
8920   ins_pipe( pipe_slow );
8921 %}
8922 
8923 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
8924   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8925   match(Set dst (URShiftVI dst shift));
8926   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8927   ins_encode %{
8928     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8929   %}
8930   ins_pipe( pipe_slow );
8931 %}
8932 
8933 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
8934   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8935   match(Set dst (URShiftVI src shift));
8936   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8937   ins_encode %{
8938     int vector_len = 0;
8939     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8940   %}
8941   ins_pipe( pipe_slow );
8942 %}
8943 
8944 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8945   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8946   match(Set dst (URShiftVI src shift));
8947   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8948   ins_encode %{
8949     int vector_len = 0;
8950     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8951   %}
8952   ins_pipe( pipe_slow );
8953 %}
8954 
8955 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
8956   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8957   match(Set dst (URShiftVI src shift));
8958   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8959   ins_encode %{
8960     int vector_len = 1;
8961     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8962   %}
8963   ins_pipe( pipe_slow );
8964 %}
8965 
8966 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8967   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8968   match(Set dst (URShiftVI src shift));
8969   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8970   ins_encode %{
8971     int vector_len = 1;
8972     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8973   %}
8974   ins_pipe( pipe_slow );
8975 %}
8976 
8977 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
8978   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8979   match(Set dst (URShiftVI src shift));
8980   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8981   ins_encode %{
8982     int vector_len = 2;
8983     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8984   %}
8985   ins_pipe( pipe_slow );
8986 %}
8987 
8988 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8989   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8990   match(Set dst (URShiftVI src shift));
8991   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8992   ins_encode %{
8993     int vector_len = 2;
8994     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8995   %}
8996   ins_pipe( pipe_slow );
8997 %}
8998 
8999 // Longs vector logical right shift
9000 instruct vsrl2L(vecX dst, vecS shift) %{
9001   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9002   match(Set dst (URShiftVL dst shift));
9003   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9004   ins_encode %{
9005     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
9006   %}
9007   ins_pipe( pipe_slow );
9008 %}
9009 
9010 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9011   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9012   match(Set dst (URShiftVL dst shift));
9013   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9014   ins_encode %{
9015     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9016   %}
9017   ins_pipe( pipe_slow );
9018 %}
9019 
9020 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9021   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9022   match(Set dst (URShiftVL src shift));
9023   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9024   ins_encode %{
9025     int vector_len = 0;
9026     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9027   %}
9028   ins_pipe( pipe_slow );
9029 %}
9030 
9031 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9032   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9033   match(Set dst (URShiftVL src shift));
9034   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9035   ins_encode %{
9036     int vector_len = 0;
9037     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9038   %}
9039   ins_pipe( pipe_slow );
9040 %}
9041 
9042 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9043   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9044   match(Set dst (URShiftVL src shift));
9045   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9046   ins_encode %{
9047     int vector_len = 1;
9048     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9049   %}
9050   ins_pipe( pipe_slow );
9051 %}
9052 
9053 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9054   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9055   match(Set dst (URShiftVL src shift));
9056   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9057   ins_encode %{
9058     int vector_len = 1;
9059     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9060   %}
9061   ins_pipe( pipe_slow );
9062 %}
9063 
9064 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9065   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9066   match(Set dst (URShiftVL src shift));
9067   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9068   ins_encode %{
9069     int vector_len = 2;
9070     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9071   %}
9072   ins_pipe( pipe_slow );
9073 %}
9074 
9075 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9076   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9077   match(Set dst (URShiftVL src shift));
9078   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9079   ins_encode %{
9080     int vector_len = 2;
9081     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9082   %}
9083   ins_pipe( pipe_slow );
9084 %}
9085 
9086 // ------------------- ArithmeticRightShift -----------------------------------
9087 
9088 // Shorts/Chars vector arithmetic right shift
9089 instruct vsra2S(vecS dst, vecS shift) %{
9090   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9091   match(Set dst (RShiftVS dst shift));
9092   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9093   ins_encode %{
9094     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9095   %}
9096   ins_pipe( pipe_slow );
9097 %}
9098 
9099 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9100   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9101   match(Set dst (RShiftVS dst shift));
9102   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9103   ins_encode %{
9104     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9105   %}
9106   ins_pipe( pipe_slow );
9107 %}
9108 
9109 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
9110   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9111   match(Set dst (RShiftVS src shift));
9112   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9113   ins_encode %{
9114     int vector_len = 0;
9115     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9116   %}
9117   ins_pipe( pipe_slow );
9118 %}
9119 
9120 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
9121   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9122   match(Set dst (RShiftVS src shift));
9123   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9124   ins_encode %{
9125     int vector_len = 0;
9126     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9127   %}
9128   ins_pipe( pipe_slow );
9129 %}
9130 
9131 instruct vsra4S(vecD dst, vecS shift) %{
9132   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9133   match(Set dst (RShiftVS dst shift));
9134   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9135   ins_encode %{
9136     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9137   %}
9138   ins_pipe( pipe_slow );
9139 %}
9140 
9141 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9142   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9143   match(Set dst (RShiftVS dst shift));
9144   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9145   ins_encode %{
9146     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9147   %}
9148   ins_pipe( pipe_slow );
9149 %}
9150 
9151 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
9152   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9153   match(Set dst (RShiftVS src shift));
9154   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9155   ins_encode %{
9156     int vector_len = 0;
9157     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9158   %}
9159   ins_pipe( pipe_slow );
9160 %}
9161 
9162 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
9163   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9164   match(Set dst (RShiftVS src shift));
9165   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9166   ins_encode %{
9167     int vector_len = 0;
9168     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9169   %}
9170   ins_pipe( pipe_slow );
9171 %}
9172 
9173 instruct vsra8S(vecX dst, vecS shift) %{
9174   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9175   match(Set dst (RShiftVS dst shift));
9176   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9177   ins_encode %{
9178     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9179   %}
9180   ins_pipe( pipe_slow );
9181 %}
9182 
9183 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9184   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9185   match(Set dst (RShiftVS dst shift));
9186   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9187   ins_encode %{
9188     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9189   %}
9190   ins_pipe( pipe_slow );
9191 %}
9192 
9193 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
9194   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9195   match(Set dst (RShiftVS src shift));
9196   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9197   ins_encode %{
9198     int vector_len = 0;
9199     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9200   %}
9201   ins_pipe( pipe_slow );
9202 %}
9203 
9204 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
9205   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9206   match(Set dst (RShiftVS src shift));
9207   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9208   ins_encode %{
9209     int vector_len = 0;
9210     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9211   %}
9212   ins_pipe( pipe_slow );
9213 %}
9214 
9215 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
9216   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9217   match(Set dst (RShiftVS src shift));
9218   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9219   ins_encode %{
9220     int vector_len = 1;
9221     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9222   %}
9223   ins_pipe( pipe_slow );
9224 %}
9225 
9226 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
9227   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9228   match(Set dst (RShiftVS src shift));
9229   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9230   ins_encode %{
9231     int vector_len = 1;
9232     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9233   %}
9234   ins_pipe( pipe_slow );
9235 %}
9236 
9237 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
9238   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9239   match(Set dst (RShiftVS src shift));
9240   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9241   ins_encode %{
9242     int vector_len = 2;
9243     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9244   %}
9245   ins_pipe( pipe_slow );
9246 %}
9247 
9248 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9249   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9250   match(Set dst (RShiftVS src shift));
9251   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9252   ins_encode %{
9253     int vector_len = 2;
9254     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9255   %}
9256   ins_pipe( pipe_slow );
9257 %}
9258 
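// Illustration only, not a matcher rule: a hypothetical Java loop of the kind that
// C2's superword auto-vectorizer may reduce to the RShiftVS nodes matched above
// (an int loop of the same shape feeds the RShiftVI rules below).  In the encodings
// in this section, vector_len 0/1/2 selects the 128/256/512-bit form of the AVX
// instruction.
//
//   static void sraShorts(short[] a, int s) {
//     for (int i = 0; i < a.length; i++) {
//       a[i] = (short) (a[i] >> s);   // arithmetic (sign-extending) right shift
//     }
//   }
//
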
9259 // Integers vector arithmetic right shift
9260 instruct vsra2I(vecD dst, vecS shift) %{
9261   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9262   match(Set dst (RShiftVI dst shift));
9263   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9264   ins_encode %{
9265     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9266   %}
9267   ins_pipe( pipe_slow );
9268 %}
9269 
9270 instruct vsra2I_imm(vecD dst, immI8 shift) %{
9271   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9272   match(Set dst (RShiftVI dst shift));
9273   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9274   ins_encode %{
9275     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9276   %}
9277   ins_pipe( pipe_slow );
9278 %}
9279 
9280 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
9281   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9282   match(Set dst (RShiftVI src shift));
9283   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9284   ins_encode %{
9285     int vector_len = 0;
9286     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9287   %}
9288   ins_pipe( pipe_slow );
9289 %}
9290 
9291 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9292   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9293   match(Set dst (RShiftVI src shift));
9294   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9295   ins_encode %{
9296     int vector_len = 0;
9297     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9298   %}
9299   ins_pipe( pipe_slow );
9300 %}
9301 
9302 instruct vsra4I(vecX dst, vecS shift) %{
9303   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9304   match(Set dst (RShiftVI dst shift));
9305   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9306   ins_encode %{
9307     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vsra4I_imm(vecX dst, immI8 shift) %{
9313   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9314   match(Set dst (RShiftVI dst shift));
9315   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9316   ins_encode %{
9317     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9318   %}
9319   ins_pipe( pipe_slow );
9320 %}
9321 
9322 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
9323   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9324   match(Set dst (RShiftVI src shift));
9325   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9326   ins_encode %{
9327     int vector_len = 0;
9328     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9329   %}
9330   ins_pipe( pipe_slow );
9331 %}
9332 
9333 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9334   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9335   match(Set dst (RShiftVI src shift));
9336   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9337   ins_encode %{
9338     int vector_len = 0;
9339     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9340   %}
9341   ins_pipe( pipe_slow );
9342 %}
9343 
9344 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
9345   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9346   match(Set dst (RShiftVI src shift));
9347   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9348   ins_encode %{
9349     int vector_len = 1;
9350     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9351   %}
9352   ins_pipe( pipe_slow );
9353 %}
9354 
9355 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9356   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9357   match(Set dst (RShiftVI src shift));
9358   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9359   ins_encode %{
9360     int vector_len = 1;
9361     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9362   %}
9363   ins_pipe( pipe_slow );
9364 %}
9365 
9366 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
9367   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9368   match(Set dst (RShiftVI src shift));
9369   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9370   ins_encode %{
9371     int vector_len = 2;
9372     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9373   %}
9374   ins_pipe( pipe_slow );
9375 %}
9376 
9377 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9378   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9379   match(Set dst (RShiftVI src shift));
9380   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9381   ins_encode %{
9382     int vector_len = 2;
9383     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9384   %}
9385   ins_pipe( pipe_slow );
9386 %}
9387 
9388 // There are no vector arithmetic right shift instructions for longs
9388 // (pre-AVX-512 x86 provides no packed 64-bit arithmetic right shift).
9389 
9390 
9391 // --------------------------------- AND --------------------------------------
9392 
9393 instruct vand4B(vecS dst, vecS src) %{
9394   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9395   match(Set dst (AndV dst src));
9396   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
9397   ins_encode %{
9398     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9399   %}
9400   ins_pipe( pipe_slow );
9401 %}
9402 
9403 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
9404   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9405   match(Set dst (AndV src1 src2));
9406   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
9407   ins_encode %{
9408     int vector_len = 0;
9409     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9410   %}
9411   ins_pipe( pipe_slow );
9412 %}
9413 
9414 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
9415   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9416   match(Set dst (AndV src (LoadVector mem)));
9417   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
9418   ins_encode %{
9419     int vector_len = 0;
9420     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9421   %}
9422   ins_pipe( pipe_slow );
9423 %}
9424 
9425 instruct vand8B(vecD dst, vecD src) %{
9426   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9427   match(Set dst (AndV dst src));
9428   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
9429   ins_encode %{
9430     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9431   %}
9432   ins_pipe( pipe_slow );
9433 %}
9434 
9435 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
9436   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9437   match(Set dst (AndV src1 src2));
9438   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
9439   ins_encode %{
9440     int vector_len = 0;
9441     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9442   %}
9443   ins_pipe( pipe_slow );
9444 %}
9445 
9446 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
9447   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9448   match(Set dst (AndV src (LoadVector mem)));
9449   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
9450   ins_encode %{
9451     int vector_len = 0;
9452     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9453   %}
9454   ins_pipe( pipe_slow );
9455 %}
9456 
9457 instruct vand16B(vecX dst, vecX src) %{
9458   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9459   match(Set dst (AndV dst src));
9460   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
9461   ins_encode %{
9462     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9463   %}
9464   ins_pipe( pipe_slow );
9465 %}
9466 
9467 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
9468   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9469   match(Set dst (AndV src1 src2));
9470   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
9471   ins_encode %{
9472     int vector_len = 0;
9473     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9474   %}
9475   ins_pipe( pipe_slow );
9476 %}
9477 
9478 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
9479   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9480   match(Set dst (AndV src (LoadVector mem)));
9481   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
9482   ins_encode %{
9483     int vector_len = 0;
9484     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9485   %}
9486   ins_pipe( pipe_slow );
9487 %}
9488 
9489 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
9490   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9491   match(Set dst (AndV src1 src2));
9492   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
9493   ins_encode %{
9494     int vector_len = 1;
9495     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9496   %}
9497   ins_pipe( pipe_slow );
9498 %}
9499 
9500 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
9501   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9502   match(Set dst (AndV src (LoadVector mem)));
9503   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
9504   ins_encode %{
9505     int vector_len = 1;
9506     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9507   %}
9508   ins_pipe( pipe_slow );
9509 %}
9510 
9511 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9512   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9513   match(Set dst (AndV src1 src2));
9514   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
9515   ins_encode %{
9516     int vector_len = 2;
9517     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9518   %}
9519   ins_pipe( pipe_slow );
9520 %}
9521 
9522 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
9523   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9524   match(Set dst (AndV src (LoadVector mem)));
9525   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
9526   ins_encode %{
9527     int vector_len = 2;
9528     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9529   %}
9530   ins_pipe( pipe_slow );
9531 %}
9532 
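// Illustration only: a hypothetical loop that the auto-vectorizer may turn into the
// AndV rules above; the same shape with '|' or '^' exercises the OrV and XorV rules
// in the sections that follow.
//
//   static void andInts(int[] dst, int[] src) {
//     for (int i = 0; i < dst.length; i++) {
//       dst[i] &= src[i];   // lane-wise bitwise AND
//     }
//   }
//
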
9533 // --------------------------------- OR ---------------------------------------
9534 
9535 instruct vor4B(vecS dst, vecS src) %{
9536   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9537   match(Set dst (OrV dst src));
9538   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
9539   ins_encode %{
9540     __ por($dst$$XMMRegister, $src$$XMMRegister);
9541   %}
9542   ins_pipe( pipe_slow );
9543 %}
9544 
9545 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
9546   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9547   match(Set dst (OrV src1 src2));
9548   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
9549   ins_encode %{
9550     int vector_len = 0;
9551     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9552   %}
9553   ins_pipe( pipe_slow );
9554 %}
9555 
9556 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
9557   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9558   match(Set dst (OrV src (LoadVector mem)));
9559   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
9560   ins_encode %{
9561     int vector_len = 0;
9562     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9563   %}
9564   ins_pipe( pipe_slow );
9565 %}
9566 
9567 instruct vor8B(vecD dst, vecD src) %{
9568   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9569   match(Set dst (OrV dst src));
9570   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
9571   ins_encode %{
9572     __ por($dst$$XMMRegister, $src$$XMMRegister);
9573   %}
9574   ins_pipe( pipe_slow );
9575 %}
9576 
9577 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9578   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9579   match(Set dst (OrV src1 src2));
9580   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9581   ins_encode %{
9582     int vector_len = 0;
9583     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9584   %}
9585   ins_pipe( pipe_slow );
9586 %}
9587 
9588 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9589   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9590   match(Set dst (OrV src (LoadVector mem)));
9591   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9592   ins_encode %{
9593     int vector_len = 0;
9594     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9595   %}
9596   ins_pipe( pipe_slow );
9597 %}
9598 
9599 instruct vor16B(vecX dst, vecX src) %{
9600   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9601   match(Set dst (OrV dst src));
9602   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9603   ins_encode %{
9604     __ por($dst$$XMMRegister, $src$$XMMRegister);
9605   %}
9606   ins_pipe( pipe_slow );
9607 %}
9608 
9609 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9610   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9611   match(Set dst (OrV src1 src2));
9612   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9613   ins_encode %{
9614     int vector_len = 0;
9615     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9621   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9622   match(Set dst (OrV src (LoadVector mem)));
9623   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9624   ins_encode %{
9625     int vector_len = 0;
9626     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9627   %}
9628   ins_pipe( pipe_slow );
9629 %}
9630 
9631 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9632   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9633   match(Set dst (OrV src1 src2));
9634   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9635   ins_encode %{
9636     int vector_len = 1;
9637     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9638   %}
9639   ins_pipe( pipe_slow );
9640 %}
9641 
9642 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9643   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9644   match(Set dst (OrV src (LoadVector mem)));
9645   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9646   ins_encode %{
9647     int vector_len = 1;
9648     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9649   %}
9650   ins_pipe( pipe_slow );
9651 %}
9652 
9653 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9654   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9655   match(Set dst (OrV src1 src2));
9656   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9657   ins_encode %{
9658     int vector_len = 2;
9659     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9660   %}
9661   ins_pipe( pipe_slow );
9662 %}
9663 
9664 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9665   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9666   match(Set dst (OrV src (LoadVector mem)));
9667   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9668   ins_encode %{
9669     int vector_len = 2;
9670     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9671   %}
9672   ins_pipe( pipe_slow );
9673 %}
9674 
9675 // --------------------------------- XOR --------------------------------------
9676 
9677 instruct vxor4B(vecS dst, vecS src) %{
9678   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9679   match(Set dst (XorV dst src));
9680   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9681   ins_encode %{
9682     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9683   %}
9684   ins_pipe( pipe_slow );
9685 %}
9686 
9687 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9688   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9689   match(Set dst (XorV src1 src2));
9690   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9691   ins_encode %{
9692     int vector_len = 0;
9693     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9694   %}
9695   ins_pipe( pipe_slow );
9696 %}
9697 
9698 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9699   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9700   match(Set dst (XorV src (LoadVector mem)));
9701   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9702   ins_encode %{
9703     int vector_len = 0;
9704     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9705   %}
9706   ins_pipe( pipe_slow );
9707 %}
9708 
9709 instruct vxor8B(vecD dst, vecD src) %{
9710   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9711   match(Set dst (XorV dst src));
9712   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9713   ins_encode %{
9714     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9715   %}
9716   ins_pipe( pipe_slow );
9717 %}
9718 
9719 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9720   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9721   match(Set dst (XorV src1 src2));
9722   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9723   ins_encode %{
9724     int vector_len = 0;
9725     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9726   %}
9727   ins_pipe( pipe_slow );
9728 %}
9729 
9730 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9731   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9732   match(Set dst (XorV src (LoadVector mem)));
9733   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9734   ins_encode %{
9735     int vector_len = 0;
9736     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9737   %}
9738   ins_pipe( pipe_slow );
9739 %}
9740 
9741 instruct vxor16B(vecX dst, vecX src) %{
9742   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9743   match(Set dst (XorV dst src));
9744   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9745   ins_encode %{
9746     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9747   %}
9748   ins_pipe( pipe_slow );
9749 %}
9750 
9751 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9752   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9753   match(Set dst (XorV src1 src2));
9754   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9755   ins_encode %{
9756     int vector_len = 0;
9757     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9758   %}
9759   ins_pipe( pipe_slow );
9760 %}
9761 
9762 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9763   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9764   match(Set dst (XorV src (LoadVector mem)));
9765   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9766   ins_encode %{
9767     int vector_len = 0;
9768     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9769   %}
9770   ins_pipe( pipe_slow );
9771 %}
9772 
9773 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9774   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9775   match(Set dst (XorV src1 src2));
9776   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9777   ins_encode %{
9778     int vector_len = 1;
9779     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9780   %}
9781   ins_pipe( pipe_slow );
9782 %}
9783 
9784 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9785   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9786   match(Set dst (XorV src (LoadVector mem)));
9787   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9788   ins_encode %{
9789     int vector_len = 1;
9790     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9791   %}
9792   ins_pipe( pipe_slow );
9793 %}
9794 
9795 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9796   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9797   match(Set dst (XorV src1 src2));
9798   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9799   ins_encode %{
9800     int vector_len = 2;
9801     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9802   %}
9803   ins_pipe( pipe_slow );
9804 %}
9805 
9806 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9807   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9808   match(Set dst (XorV src (LoadVector mem)));
9809   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9810   ins_encode %{
9811     int vector_len = 2;
9812     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9813   %}
9814   ins_pipe( pipe_slow );
9815 %}
9816 
9817 // --------------------------------- FMA --------------------------------------
9818 
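// Illustration only: with UseFMA enabled, a hypothetical Math.fma loop such as the
// one below may be vectorized into the FmaVD/FmaVF rules of this section, each lane
// computing a * b + c with a single rounding step.
//
//   static void fmaDoubles(double[] a, double[] b, double[] c) {
//     for (int i = 0; i < c.length; i++) {
//       c[i] = Math.fma(a[i], b[i], c[i]);   // fused multiply-add per element
//     }
//   }
//
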
9819 // a * b + c
9820 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9821   predicate(UseFMA && n->as_Vector()->length() == 2);
9822   match(Set c (FmaVD  c (Binary a b)));
9823   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9824   ins_cost(150);
9825   ins_encode %{
9826     int vector_len = 0;
9827     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9828   %}
9829   ins_pipe( pipe_slow );
9830 %}
9831 
9832 // a * b + c
9833 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9834   predicate(UseFMA && n->as_Vector()->length() == 2);
9835   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9836   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9837   ins_cost(150);
9838   ins_encode %{
9839     int vector_len = 0;
9840     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9841   %}
9842   ins_pipe( pipe_slow );
9843 %}
9844 
9845 
9846 // a * b + c
9847 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9848   predicate(UseFMA && n->as_Vector()->length() == 4);
9849   match(Set c (FmaVD  c (Binary a b)));
9850   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9851   ins_cost(150);
9852   ins_encode %{
9853     int vector_len = 1;
9854     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9855   %}
9856   ins_pipe( pipe_slow );
9857 %}
9858 
9859 // a * b + c
9860 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9861   predicate(UseFMA && n->as_Vector()->length() == 4);
9862   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9863   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9864   ins_cost(150);
9865   ins_encode %{
9866     int vector_len = 1;
9867     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9868   %}
9869   ins_pipe( pipe_slow );
9870 %}
9871 
9872 // a * b + c
9873 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9874   predicate(UseFMA && n->as_Vector()->length() == 8);
9875   match(Set c (FmaVD  c (Binary a b)));
9876   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9877   ins_cost(150);
9878   ins_encode %{
9879     int vector_len = 2;
9880     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9881   %}
9882   ins_pipe( pipe_slow );
9883 %}
9884 
9885 // a * b + c
9886 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9887   predicate(UseFMA && n->as_Vector()->length() == 8);
9888   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9889   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9890   ins_cost(150);
9891   ins_encode %{
9892     int vector_len = 2;
9893     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9894   %}
9895   ins_pipe( pipe_slow );
9896 %}
9897 
9898 // a * b + c
9899 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9900   predicate(UseFMA && n->as_Vector()->length() == 4);
9901   match(Set c (FmaVF  c (Binary a b)));
9902   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9903   ins_cost(150);
9904   ins_encode %{
9905     int vector_len = 0;
9906     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9907   %}
9908   ins_pipe( pipe_slow );
9909 %}
9910 
9911 // a * b + c
9912 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9913   predicate(UseFMA && n->as_Vector()->length() == 4);
9914   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9915   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9916   ins_cost(150);
9917   ins_encode %{
9918     int vector_len = 0;
9919     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9920   %}
9921   ins_pipe( pipe_slow );
9922 %}
9923 
9924 // a * b + c
9925 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9926   predicate(UseFMA && n->as_Vector()->length() == 8);
9927   match(Set c (FmaVF  c (Binary a b)));
9928   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9929   ins_cost(150);
9930   ins_encode %{
9931     int vector_len = 1;
9932     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9933   %}
9934   ins_pipe( pipe_slow );
9935 %}
9936 
9937 // a * b + c
9938 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9939   predicate(UseFMA && n->as_Vector()->length() == 8);
9940   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9941   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9942   ins_cost(150);
9943   ins_encode %{
9944     int vector_len = 1;
9945     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9946   %}
9947   ins_pipe( pipe_slow );
9948 %}
9949 
9950 // a * b + c
9951 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9952   predicate(UseFMA && n->as_Vector()->length() == 16);
9953   match(Set c (FmaVF  c (Binary a b)));
9954   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9955   ins_cost(150);
9956   ins_encode %{
9957     int vector_len = 2;
9958     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9959   %}
9960   ins_pipe( pipe_slow );
9961 %}
9962 
9963 // a * b + c
9964 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9965   predicate(UseFMA && n->as_Vector()->length() == 16);
9966   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9967   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9968   ins_cost(150);
9969   ins_encode %{
9970     int vector_len = 2;
9971     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9972   %}
9973   ins_pipe( pipe_slow );
9974 %}
9975 
9976 // --------------------------------- Vector Multiply Add --------------------------------------
9977 
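// Illustration only: a hypothetical short dot product of the kind that may be
// reduced to MulAddVS2VI (pmaddwd/vpmaddwd), which multiplies adjacent pairs of
// 16-bit lanes and adds each pair into a 32-bit lane.
//
//   static int dotShorts(short[] a, short[] b) {
//     int sum = 0;
//     for (int i = 0; i < a.length; i++) {
//       sum += a[i] * b[i];   // 16x16->32 multiply, pairwise add, then reduce
//     }
//     return sum;
//   }
//
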
9978 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9979   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9980   match(Set dst (MulAddVS2VI dst src1));
9981   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9982   ins_encode %{
9983     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9984   %}
9985   ins_pipe( pipe_slow );
9986 %}
9987 
9988 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9989   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9990   match(Set dst (MulAddVS2VI src1 src2));
9991   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9992   ins_encode %{
9993     int vector_len = 0;
9994     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9995   %}
9996   ins_pipe( pipe_slow );
9997 %}
9998 
9999 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
10000   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
10001   match(Set dst (MulAddVS2VI dst src1));
10002   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
10003   ins_encode %{
10004     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
10005   %}
10006   ins_pipe( pipe_slow );
10007 %}
10008 
10009 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
10010   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10011   match(Set dst (MulAddVS2VI src1 src2));
10012   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
10013   ins_encode %{
10014     int vector_len = 0;
10015     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10016   %}
10017   ins_pipe( pipe_slow );
10018 %}
10019 
10020 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
10021   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10022   match(Set dst (MulAddVS2VI src1 src2));
10023   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
10024   ins_encode %{
10025     int vector_len = 1;
10026     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10027   %}
10028   ins_pipe( pipe_slow );
10029 %}
10030 
10031 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
10032   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10033   match(Set dst (MulAddVS2VI src1 src2));
10034   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
10035   ins_encode %{
10036     int vector_len = 2;
10037     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10038   %}
10039   ins_pipe( pipe_slow );
10040 %}
10041 
10042 // --------------------------------- Vector Multiply Add Add ----------------------------------
10043 
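// On CPUs reporting the AVX512_VNNI feature, the AddVI (MulAddVS2VI ...) shape that
// an accumulating loop like the hypothetical dotShorts above can produce may match
// the evpdpwssd rules below instead, fusing the multiply-add and the accumulate into
// a single instruction.
//
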
10044 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
10045   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
10046   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10047   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
10048   ins_encode %{
10049     int vector_len = 0;
10050     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10051   %}
10052   ins_pipe( pipe_slow );
10053   ins_cost(10);
10054 %}
10055 
10056 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
10057   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
10058   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10059   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
10060   ins_encode %{
10061     int vector_len = 0;
10062     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10063   %}
10064   ins_pipe( pipe_slow );
10065   ins_cost(10);
10066 %}
10067 
10068 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
10069   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
10070   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10071   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
10072   ins_encode %{
10073     int vector_len = 1;
10074     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10075   %}
10076   ins_pipe( pipe_slow );
10077   ins_cost(10);
10078 %}
10079 
10080 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
10081   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
10082   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10083   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
10084   ins_encode %{
10085     int vector_len = 2;
10086     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10087   %}
10088   ins_pipe( pipe_slow );
10089   ins_cost(10);
10090 %}
10091 
10092 // --------------------------------- PopCount --------------------------------------
10093 
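// Illustration only: with AVX-512 VPOPCNTDQ support and UsePopCountInstruction, a
// hypothetical Integer.bitCount loop such as the one below may be vectorized into
// PopCountVI.
//
//   static void popCounts(int[] src, int[] dst) {
//     for (int i = 0; i < src.length; i++) {
//       dst[i] = Integer.bitCount(src[i]);   // per-lane population count
//     }
//   }
//
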
10094 instruct vpopcount2I(vecD dst, vecD src) %{
10095   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
10096   match(Set dst (PopCountVI src));
10097   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
10098   ins_encode %{
10099     int vector_len = 0;
10100     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10101   %}
10102   ins_pipe( pipe_slow );
10103 %}
10104 
10105 instruct vpopcount4I(vecX dst, vecX src) %{
10106   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
10107   match(Set dst (PopCountVI src));
10108   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
10109   ins_encode %{
10110     int vector_len = 0;
10111     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10112   %}
10113   ins_pipe( pipe_slow );
10114 %}
10115 
10116 instruct vpopcount8I(vecY dst, vecY src) %{
10117   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
10118   match(Set dst (PopCountVI src));
10119   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
10120   ins_encode %{
10121     int vector_len = 1;
10122     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10123   %}
10124   ins_pipe( pipe_slow );
10125 %}
10126 
10127 instruct vpopcount16I(vecZ dst, vecZ src) %{
10128   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
10129   match(Set dst (PopCountVI src));
10130   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
10131   ins_encode %{
10132     int vector_len = 2;
10133     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10134   %}
10135   ins_pipe( pipe_slow );
10136 %}
--- EOF ---