1 //
   2 // Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
// architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
// XMM registers.  512-bit registers, i.e. 16 32-bit words each, labeled (a)-(p).
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
// Linux ABI:   no XMM registers are preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              (XMM16-XMM31 are volatile per the Microsoft x64 ABI)
  75 //              XMM0-XMM3 might hold parameters
  76 
// XMM0-XMM7: defined unconditionally (the _LP64-only registers start at
// XMM8 below).  Every word is SOC/SOC, i.e. caller-saved at both the
// register-allocator and C-convention level (see the save-type table above).
// Each 512-bit register is described as 16 separate 32-bit words so the
// allocator can track partial uses: word 'a' holds a Float, words 'ab' a
// Double; suffixes b..p map to ->next(1)..->next(15).
reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
// XMM8-XMM15: 64-bit (_LP64) builds only (guarded by the #ifdef above).
// These encodings (8-15) require a REX prefix (VEX when UseAVX is on,
// EVEX on EVEX-enabled hardware) -- see the encoding notes at the top of
// this section.  Same word layout as XMM0-XMM7: 16 x 32-bit words per
// register, suffix b..p = ->next(1)..->next(15).
reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
// XMM16-XMM25 (the bank continues through XMM31 below): still inside the
// _LP64 guard; per the encoding notes at the top of this section these
// registers can only be EVEX-encoded, i.e. they are usable only on
// EVEX-enabled (AVX-512) hardware.  Same 16 x 32-bit word layout as the
// lower banks, suffix b..p = ->next(1)..->next(15).
reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
// Flags (condition-code) register.  It has no allocatable VMReg backing
// (VMRegImpl::Bad()) and an ideal register type of 0; only the encoding
// field differs between the 64-bit (16) and 32-bit (8) builds --
// presumably the index just past the last general-purpose register;
// confirm against the GPR definitions earlier in this file.
#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64
 630 
// Allocation class enumerating every XMM VMReg slot defined above, in
// register order.  XMM8..XMM31 exist only on 64-bit builds; XMM16..XMM31
// are listed here unconditionally under the same _LP64 guard -- whether
// they may actually be used is decided at run time by the
// reg_class_dynamic classes below (e.g. VM_Version::supports_evex()).
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

// flags allocation class should be last.
alloc_class chunk2(RFLAGS);

// Singleton class for condition codes
reg_class int_flags(RFLAGS);
 672 
// Class for pre evex float registers.  Scalar float uses only the base
// (first) slot of each register.
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers.  Same as the legacy class plus the
// upper bank XMM16..XMM31 (64-bit only).
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

// Select the evex or legacy variant at run time from CPU capabilities.
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 733 
// Class for pre evex double registers.  Scalar double occupies the first
// two slots of each register (XMMn, XMMnb).
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers.  Same as the legacy class plus the
// upper bank XMM16..XMM31 (64-bit only).
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

// Select the evex or legacy variant at run time from CPU capabilities.
reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
// Class for pre evex 32bit vector registers.  A 32-bit vector uses only
// the base (first) slot of each register.
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers.  Same as the legacy class plus
// the upper bank XMM16..XMM31 (64-bit only).
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

// Select the evex or legacy variant at run time from CPU capabilities.
reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
// Class for pre evex 64bit vector registers.  A 64-bit vector occupies the
// first two slots of each register (XMMn, XMMnb).
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers.  Same as the legacy class plus
// the upper bank XMM16..XMM31 (64-bit only).
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

// Select the evex or legacy variant at run time from CPU capabilities.
reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
// Class for pre evex 128bit vector registers.  A 128-bit vector occupies
// the first four slots of each register (XMMn..XMMnd).
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for evex 128bit vector registers.  Same as the legacy class plus
// the upper bank XMM16..XMM31 (64-bit only).
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

// Select the evex or legacy variant at run time from CPU capabilities.
reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
// Class for pre evex 256bit vector registers.  A 256-bit vector occupies
// the first eight slots of each register (XMMn..XMMnh).
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
#endif
                      );

// Class for evex 256bit vector registers.  Same as the legacy class plus
// the upper bank XMM16..XMM31 (64-bit only).
reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
#endif
                      );

// Select the evex or legacy variant at run time from CPU capabilities.
reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
// Runtime-selected 512-bit (zmm) vector register classes: the EVEX class
// (XMM0-31 on _LP64) is used when AVX-512 is available, otherwise the
// legacy class restricted to XMM0-15.
reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
class CallStubImpl {

  //--------------------------------------------------------------
  //---<  Used for optimization in Compile::shorten_branches  >---
  //--------------------------------------------------------------

 public:
  // Size of call trampoline stub.
  // This platform emits no call trampolines, so the contribution is zero.
  static uint size_call_trampoline() {
    return 0; // no call trampolines on this platform
  }

  // number of relocations needed by a call trampoline stub
  static uint reloc_call_trampoline() {
    return 0; // no call trampolines on this platform
  }
};
1266 
// Sizing and emission entry points for the exception and deoptimization
// handler stubs (the emit_* bodies live in the source %{ %} block).
class HandlerImpl {

 public:

  static int emit_exception_handler(CodeBuffer &cbuf);
  static int emit_deopt_handler(CodeBuffer& cbuf);

  static uint size_exception_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as jump and can be patched to
    // a call by deoptimization.  (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
    return NativeJump::instruction_size;
  }

#ifdef _LP64
  static uint size_deopt_handler() {
    // three 5 byte instructions plus one move for unreachable address.
    // NOTE(review): 15 covers the three 5-byte instructions; the meaning of
    // the extra 3 bytes ("one move") should be confirmed against
    // emit_deopt_handler's 64-bit emission sequence.
    return 15+3;
  }
#else
  static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as jump and can be patched to
    // a call by deoptimization.  (4932387)
    // Note that this value is also credited (in output.cpp) to
    // the size of the code section.
    return 5 + NativeJump::instruction_size; // pushl(); jmp;
  }
#endif
};
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
// Emit deopt handler code.
// The stub pushes the address of its own first instruction and then jumps
// to the shared deoptimization blob's unpack entry. Returns the stub's
// offset within the code buffer.
int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {

  // Note that the code buffer's insts_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a handler.
  MacroAssembler _masm(&cbuf);
  address base = __ start_a_stub(size_deopt_handler());
  if (base == NULL) {
    ciEnv::current()->record_failure("CodeCache is full");
    return 0;  // CodeBuffer::expand failed
  }
  int offset = __ offset();

#ifdef _LP64
  // 'the_pc' names the address that the pushed value is adjusted to below.
  address the_pc = (address) __ pc();
  Label next;
  // push a "the_pc" on the stack without destroying any registers
  // as they all may be live.

  // push address of "next"
  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
  __ bind(next);
  // adjust it so it matches "the_pc"
  __ subptr(Address(rsp, 0), __ offset() - offset);
#else
  // 32-bit: the current pc can be pushed directly as an immediate.
  InternalAddress here(__ pc());
  __ pushptr(here.addr());
#endif

  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
  __ end_a_stub();
  return offset;
}
1359 
1360 
1361 //=============================================================================
1362 
  // Float masks come from different places depending on platform.
#ifdef _LP64
  // 64-bit: sign mask/flip constants are provided by StubRoutines.
  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
#else
  // 32-bit: sign mask/flip constants live in statically allocated pools.
  static address float_signmask()  { return (address)float_signmask_pool; }
  static address float_signflip()  { return (address)float_signflip_pool; }
  static address double_signmask() { return (address)double_signmask_pool; }
  static address double_signflip() { return (address)double_signflip_pool; }
#endif
1375 
1376 
1377 const bool Matcher::match_rule_supported(int opcode) {
1378   if (!has_match_rule(opcode))
1379     return false;
1380 
1381   bool ret_value = true;
1382   switch (opcode) {
1383     case Op_PopCountI:
1384     case Op_PopCountL:
1385       if (!UsePopCountInstruction)
1386         ret_value = false;
1387       break;
1388     case Op_PopCountVI:
1389       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1390         ret_value = false;
1391       break;
1392     case Op_MulVI:
1393       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1394         ret_value = false;
1395       break;
1396     case Op_MulVL:
1397     case Op_MulReductionVL:
1398       if (VM_Version::supports_avx512dq() == false)
1399         ret_value = false;
1400       break;
1401     case Op_AddReductionVL:
1402       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1403         ret_value = false;
1404       break;
1405     case Op_AddReductionVI:
1406       if (UseSSE < 3) // requires at least SSE3
1407         ret_value = false;
1408       break;
1409     case Op_MulReductionVI:
1410       if (UseSSE < 4) // requires at least SSE4
1411         ret_value = false;
1412       break;
1413     case Op_AddReductionVF:
1414     case Op_AddReductionVD:
1415     case Op_MulReductionVF:
1416     case Op_MulReductionVD:
1417       if (UseSSE < 1) // requires at least SSE
1418         ret_value = false;
1419       break;
1420     case Op_SqrtVD:
1421     case Op_SqrtVF:
1422       if (UseAVX < 1) // enabled for AVX only
1423         ret_value = false;
1424       break;
1425     case Op_CompareAndSwapL:
1426 #ifdef _LP64
1427     case Op_CompareAndSwapP:
1428 #endif
1429       if (!VM_Version::supports_cx8())
1430         ret_value = false;
1431       break;
1432     case Op_CMoveVF:
1433     case Op_CMoveVD:
1434       if (UseAVX < 1 || UseAVX > 2)
1435         ret_value = false;
1436       break;
1437     case Op_StrIndexOf:
1438       if (!UseSSE42Intrinsics)
1439         ret_value = false;
1440       break;
1441     case Op_StrIndexOfChar:
1442       if (!UseSSE42Intrinsics)
1443         ret_value = false;
1444       break;
1445     case Op_OnSpinWait:
1446       if (VM_Version::supports_on_spin_wait() == false)
1447         ret_value = false;
1448       break;
1449     case Op_MulAddVS2VI:
1450       if (UseSSE < 2)
1451         ret_value = false;
1452       break;
1453   }
1454 
1455   return ret_value;  // Per default match rules are supported.
1456 }
1457 
1458 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1459   // identify extra cases that we might want to provide match rules for
1460   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
1461   bool ret_value = match_rule_supported(opcode);
1462   if (ret_value) {
1463     switch (opcode) {
1464       case Op_AddVB:
1465       case Op_SubVB:
1466         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1467           ret_value = false;
1468         break;
1469       case Op_URShiftVS:
1470       case Op_RShiftVS:
1471       case Op_LShiftVS:
1472       case Op_MulVS:
1473       case Op_AddVS:
1474       case Op_SubVS:
1475         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1476           ret_value = false;
1477         break;
1478       case Op_CMoveVF:
1479         if (vlen != 8)
1480           ret_value  = false;
1481         break;
1482       case Op_CMoveVD:
1483         if (vlen != 4)
1484           ret_value  = false;
1485         break;
1486     }
1487   }
1488 
1489   return ret_value;  // Per default match rules are supported.
1490 }
1491 
1492 const bool Matcher::has_predicated_vectors(void) {
1493   bool ret_value = false;
1494   if (UseAVX > 2) {
1495     ret_value = VM_Version::supports_avx512vl();
1496   }
1497 
1498   return ret_value;
1499 }
1500 
1501 const int Matcher::float_pressure(int default_pressure_threshold) {
1502   int float_pressure_threshold = default_pressure_threshold;
1503 #ifdef _LP64
1504   if (UseAVX > 2) {
1505     // Increase pressure threshold on machines with AVX3 which have
1506     // 2x more XMM registers.
1507     float_pressure_threshold = default_pressure_threshold * 2;
1508   }
1509 #endif
1510   return float_pressure_threshold;
1511 }
1512 
1513 // Max vector size in bytes. 0 if not supported.
1514 const int Matcher::vector_width_in_bytes(BasicType bt) {
1515   assert(is_java_primitive(bt), "only primitive type vectors");
1516   if (UseSSE < 2) return 0;
1517   // SSE2 supports 128bit vectors for all types.
1518   // AVX2 supports 256bit vectors for all types.
1519   // AVX2/EVEX supports 512bit vectors for all types.
1520   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1521   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1522   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1523     size = (UseAVX > 2) ? 64 : 32;
1524   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1525     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1526   // Use flag to limit vector size.
1527   size = MIN2(size,(int)MaxVectorSize);
1528   // Minimum 2 values in vector (or 4 for bytes).
1529   switch (bt) {
1530   case T_DOUBLE:
1531   case T_LONG:
1532     if (size < 16) return 0;
1533     break;
1534   case T_FLOAT:
1535   case T_INT:
1536     if (size < 8) return 0;
1537     break;
1538   case T_BOOLEAN:
1539     if (size < 4) return 0;
1540     break;
1541   case T_CHAR:
1542     if (size < 4) return 0;
1543     break;
1544   case T_BYTE:
1545     if (size < 4) return 0;
1546     break;
1547   case T_SHORT:
1548     if (size < 4) return 0;
1549     break;
1550   default:
1551     ShouldNotReachHere();
1552   }
1553   return size;
1554 }
1555 
1556 // Limits on vector size (number of elements) loaded into vector.
1557 const int Matcher::max_vector_size(const BasicType bt) {
1558   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1559 }
1560 const int Matcher::min_vector_size(const BasicType bt) {
1561   int max_size = max_vector_size(bt);
1562   // Min size which can be loaded into vector is 4 bytes.
1563   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1564   return MIN2(size,max_size);
1565 }
1566 
1567 // Vector ideal reg corresponding to specified size in bytes
1568 const uint Matcher::vector_ideal_reg(int size) {
1569   assert(MaxVectorSize >= size, "");
1570   switch(size) {
1571     case  4: return Op_VecS;
1572     case  8: return Op_VecD;
1573     case 16: return Op_VecX;
1574     case 32: return Op_VecY;
1575     case 64: return Op_VecZ;
1576   }
1577   ShouldNotReachHere();
1578   return 0;
1579 }
1580 
// Only lowest bits of xmm reg are used for vector shift count.
const uint Matcher::vector_shift_count_ideal_reg(int size) {
  // A 32-bit (VecS) register always suffices to hold the shift count,
  // regardless of the width of the vector being shifted.
  return Op_VecS;
}
1585 
// x86 supports misaligned vectors store/load.
const bool Matcher::misaligned_vectors_ok() {
  // Unaligned vector moves (e.g. movdqu/vmovdqu) are available, so no
  // alignment restriction needs to be imposed on vector memory operands.
  return true;
}
1590 
// x86 AES instructions are compatible with SunJCE expanded
// keys, hence we do not need to pass the original key to stubs
// (platforms whose AES hardware uses a different key schedule would
// return true here).
const bool Matcher::pass_original_key_for_aes() {
  return false;
}
1596 
1597 
// NOTE(review): flag consumed by shared Matcher code — appears to require an
// explicit ConvI2L node when int values are used in long contexts on this
// platform; confirm against the shared matcher sources.
const bool Matcher::convi2l_type_required = true;
1599 
// Check for shift by small constant as well
// Returns true (flagging the involved nodes as address-visited and pushing
// their inputs for matching) when 'shift' is a left shift by a constant
// <= 3, i.e. one that fits an x86 scaled-index addressing mode.
static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
      shift->in(2)->get_int() <= 3 &&
      // Are there other uses besides address expressions?
      !matcher->is_visited(shift)) {
    address_visited.set(shift->_idx); // Flag as address_visited
    mstack.push(shift->in(2), Matcher::Visit);
    Node *conv = shift->in(1);
#ifdef _LP64
    // Allow Matcher to match the rule which bypass
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
    if (conv->Opcode() == Op_ConvI2L &&
        conv->as_Type()->type()->is_long()->_lo >= 0 &&
        // Are there other uses besides address expressions?
        !matcher->is_visited(conv)) {
      address_visited.set(conv->_idx); // Flag as address_visited
      mstack.push(conv->in(1), Matcher::Pre_Visit);
    } else
#endif
      // Not a bypassable ConvI2L (or 32-bit build): match 'conv' as usual.
      mstack.push(conv, Matcher::Pre_Visit);
    return true;
  }
  return false;
}
1626 
// Should the Matcher clone shifts on addressing modes, expecting them
// to be subsumed into complex addressing expressions or compute them
// into registers?
// Returns true when the AddP's sub-expressions were cloned into the
// matcher stack so they can fold into an x86 addressing mode.
bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
  Node *off = m->in(AddPNode::Offset);
  if (off->is_Con()) {
    address_visited.test_set(m->_idx); // Flag as address_visited
    Node *adr = m->in(AddPNode::Address);

    // Intel can handle 2 adds in addressing mode
    // AtomicAdd is not an addressing expression.
    // Cheap to find it by looking for screwy base.
    if (adr->is_AddP() &&
        !adr->in(AddPNode::Base)->is_top() &&
        // Are there other uses besides address expressions?
        !is_visited(adr)) {
      address_visited.set(adr->_idx); // Flag as address_visited
      Node *shift = adr->in(AddPNode::Offset);
      // Fold a small-constant shift (scaled index) into the address if possible.
      if (!clone_shift(shift, this, mstack, address_visited)) {
        mstack.push(shift, Pre_Visit);
      }
      mstack.push(adr->in(AddPNode::Address), Pre_Visit);
      mstack.push(adr->in(AddPNode::Base), Pre_Visit);
    } else {
      mstack.push(adr, Pre_Visit);
    }

    // Clone X+offset as it also folds into most addressing expressions
    mstack.push(off, Visit);
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
    return true;
  } else if (clone_shift(off, this, mstack, address_visited)) {
    // Non-constant offset that is a small-constant shift: clone it so it
    // can still be subsumed into the addressing mode.
    address_visited.test_set(m->_idx); // Flag as address_visited
    mstack.push(m->in(AddPNode::Address), Pre_Visit);
    mstack.push(m->in(AddPNode::Base), Pre_Visit);
    return true;
  }
  return false;
}
1666 
void Compile::reshape_address(AddPNode* addp) {
  // Intentionally empty on x86: address expressions are folded directly
  // during matching (see Matcher::clone_address_expressions above), so no
  // pre-matching reshaping of AddP nodes is performed.
}
1669 
1670 // Helper methods for MachSpillCopyNode::implementation().
static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // Register-to-register vector move for spill copies.
  // cbuf != NULL : emit the move and return the emitted size in bytes.
  // cbuf == NULL : (non-PRODUCT) print the assembly to 'st' if !do_size,
  //                and return an estimated encoding size.
  // In 64-bit VM size calculation is very complex. Emitting instructions
  // into scratch buffer is used to get size in 64-bit VM.
  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
  assert(ireg == Op_VecS || // 32bit vector
         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
         "no non-adjacent vector moves" );
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    int offset = __ offset();
    switch (ireg) {
    case Op_VecS: // copy whole register
    case Op_VecD:
    case Op_VecX:
#ifndef LP64
      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
#else
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        // AVX-512 without VL: zero the destination, then insert the low 128 bits.
        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
        __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
     }
#endif
      break;
    case Op_VecY:
#ifndef LP64
      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
#else
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        // AVX-512 without VL: zero the destination, then insert the low 256 bits.
        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
        __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
     }
#endif
      break;
    case Op_VecZ:
      __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
      break;
    default:
      ShouldNotReachHere();
    }
    int size = __ offset() - offset;
#ifdef ASSERT
    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculattion");
#endif
    return size;
#ifndef PRODUCT
  } else if (!do_size) {
    switch (ireg) {
    case Op_VecS:
    case Op_VecD:
    case Op_VecX:
      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      break;
    case Op_VecY:
    case Op_VecZ:
      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      break;
    default:
      ShouldNotReachHere();
    }
#endif
  }
  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
  return (UseAVX > 2) ? 6 : 4;
}
1742 
static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
                            int stack_offset, int reg, uint ireg, outputStream* st) {
  // Vector load/store between a register and a stack slot (spill/unspill).
  // cbuf != NULL : emit the instruction and return the emitted size in bytes.
  // cbuf == NULL : (non-PRODUCT) print assembly to 'st' if !do_size, then
  //                return an estimated encoding size (see tail of function).
  // In 64-bit VM size calculation is very complex. Emitting instructions
  // into scratch buffer is used to get size in 64-bit VM.
  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    int offset = __ offset();
    if (is_load) {
      switch (ireg) {
      case Op_VecS:
        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        break;
      case Op_VecD:
        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        break;
      case Op_VecX:
#ifndef LP64
        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
#else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        } else {
          // AVX-512 without VL: zero the register, then insert the low 128 bits.
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
        }
#endif
        break;
      case Op_VecY:
#ifndef LP64
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
#else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        } else {
          // AVX-512 without VL: zero the register, then insert the low 256 bits.
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
        }
#endif
        break;
      case Op_VecZ:
        __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
        break;
      default:
        ShouldNotReachHere();
      }
    } else { // store
      switch (ireg) {
      case Op_VecS:
        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        break;
      case Op_VecD:
        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        break;
      case Op_VecX:
#ifndef LP64
        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
#else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        }
        else {
          // AVX-512 without VL: extract the low 128 bits to memory.
          __ vextracti32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
        }
#endif
        break;
      case Op_VecY:
#ifndef LP64
        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
#else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        }
        else {
          // AVX-512 without VL: extract the low 256 bits to memory.
          __ vextracti64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
        }
#endif
        break;
      case Op_VecZ:
        __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
        break;
      default:
        ShouldNotReachHere();
      }
    }
    int size = __ offset() - offset;
#ifdef ASSERT
    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculattion");
#endif
    return size;
#ifndef PRODUCT
  } else if (!do_size) {
    if (is_load) {
      switch (ireg) {
      case Op_VecS:
        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
      case Op_VecD:
        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
       case Op_VecX:
        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
      case Op_VecY:
      case Op_VecZ:
        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
        break;
      default:
        ShouldNotReachHere();
      }
    } else { // store
      switch (ireg) {
      case Op_VecS:
        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
      case Op_VecD:
        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
       case Op_VecX:
        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
      case Op_VecY:
      case Op_VecZ:
        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
        break;
      default:
        ShouldNotReachHere();
      }
    }
#endif
  }
  // Estimate the encoding size without emitting: base opcode bytes plus the
  // displacement size, taking EVEX compressed-displacement rules into account.
  bool is_single_byte = false;
  int vec_len = 0;
  if ((UseAVX > 2) && (stack_offset != 0)) {
    int tuple_type = Assembler::EVEX_FVM;
    int input_size = Assembler::EVEX_32bit;
    switch (ireg) {
    case Op_VecS:
      tuple_type = Assembler::EVEX_T1S;
      break;
    case Op_VecD:
      tuple_type = Assembler::EVEX_T1S;
      input_size = Assembler::EVEX_64bit;
      break;
    case Op_VecX:
      break;
    case Op_VecY:
      vec_len = 1;
      break;
    case Op_VecZ:
      vec_len = 2;
      break;
    }
    // Can the displacement be encoded as a single (compressed) byte?
    is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
  }
  int offset_size = 0;
  int size = 5;
  if (UseAVX > 2 ) {
    if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
    } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
      offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
    } else {
      offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
    }
  } else {
    offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
  }
  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
  return size+offset_size;
}
1918 
1919 static inline jint replicate4_imm(int con, int width) {
1920   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1921   assert(width == 1 || width == 2, "only byte or short types here");
1922   int bit_width = width * 8;
1923   jint val = con;
1924   val &= (1 << bit_width) - 1;  // mask off sign bits
1925   while(bit_width < 32) {
1926     val |= (val << bit_width);
1927     bit_width <<= 1;
1928   }
1929   return val;
1930 }
1931 
1932 static inline jlong replicate8_imm(int con, int width) {
1933   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1934   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1935   int bit_width = width * 8;
1936   jlong val = con;
1937   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1938   while(bit_width < 64) {
1939     val |= (val << bit_width);
1940     bit_width <<= 1;
1941   }
1942   return val;
1943 }
1944 
#ifndef PRODUCT
  // Pretty-print the nop pad: shows how many bytes of padding are emitted.
  void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
    st->print("nop \t# %d bytes pad for loops and calls", _count);
  }
#endif
1950 
  // Emit '_count' bytes of nop padding via the macro assembler.
  void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
    MacroAssembler _masm(&cbuf);
    __ nop(_count);
  }
1955 
  // Machine size of the pad is the requested byte count.
  uint MachNopNode::size(PhaseRegAlloc*) const {
    return _count;
  }
1959 
#ifndef PRODUCT
  // Pretty-print a debugger breakpoint node.
  void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
    st->print("# breakpoint");
  }
#endif
1965 
  // Emit an int3 software-breakpoint instruction.
  void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
    MacroAssembler _masm(&cbuf);
    __ int3();
  }
1970 
  // Defer to the generic MachNode size computation.
  uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
    return MachNode::size(ra_);
  }
1974 
1975 %}
1976 
1977 encode %{
1978 
  // Post-call check (only with -XX:+VerifyStackAtCalls): verify the stack
  // depth is unchanged by looking for the magic cookie planted below the
  // frame; trap with int3 on mismatch.
  enc_class call_epilog %{
    if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find majik cookie on stack
      int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
      MacroAssembler _masm(&cbuf);
      Label L;
      __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
      __ jccb(Assembler::equal, L);
      // Die if stack mismatch
      __ int3();
      __ bind(L);
    }
  %}
1992 
1993 %}
1994 
1995 
1996 //----------OPERANDS-----------------------------------------------------------
1997 // Operand definitions must precede instruction definitions for correct parsing
1998 // in the ADLC because operands constitute user defined types which are used in
1999 // instruction definitions.
2000 
// 512-bit vector register operand (allocated from the vectorz_reg class).
operand vecZ() %{
  constraint(ALLOC_IN_RC(vectorz_reg));
  match(VecZ);

  format %{ %}
  interface(REG_INTER);
%}
2008 
// 512-bit vector operand restricted to the vectorz_reg_vl register class
// (presumably the legacy/VL-capable subset — see the register class definition).
operand legVecZ() %{
  constraint(ALLOC_IN_RC(vectorz_reg_vl));
  match(VecZ);

  format %{ %}
  interface(REG_INTER);
%}
2016 
2017 // Comparison Code for FP conditional move
operand cmpOp_vcmppd() %{
  match(Bool);

  // Overflow tests have no vcmppd predicate; exclude them here.
  predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
            n->as_Bool()->_test._test != BoolTest::no_overflow);
  format %{ "" %}
  // The encodings below appear to be vcmppd immediate predicate values
  // (TODO confirm against the instruction's predicate table).
  interface(COND_INTER) %{
    equal        (0x0, "eq");
    less         (0x1, "lt");
    less_equal   (0x2, "le");
    not_equal    (0xC, "ne");
    greater_equal(0xD, "ge");
    greater      (0xE, "gt");
    //TODO cannot compile (adlc breaks) without two next lines with error:
    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
    // equal' for overflow.
    overflow     (0x20, "o");  // not really supported by the instruction
    no_overflow  (0x21, "no"); // not really supported by the instruction
  %}
%}
2038 
2039 
2040 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2041 
2042 // ============================================================================
2043 
// Halt node: emit ud2 to stop execution on paths that must never be reached.
instruct ShouldNotReachHere() %{
  match(Halt);
  format %{ "ud2\t# ShouldNotReachHere" %}
  ins_encode %{
    __ ud2();
  %}
  ins_pipe(pipe_slow);
%}
2052 
2053 // =================================EVEX special===============================
2054 
// Set the vector mask from a general-purpose register (SetVectMaskI);
// only matched when the platform supports predicated vector operations.
instruct setMask(rRegI dst, rRegI src) %{
  predicate(Matcher::has_predicated_vectors());
  match(Set dst (SetVectMaskI  src));
  effect(TEMP dst);
  format %{ "setvectmask   $dst, $src" %}
  ins_encode %{
    __ setvectmask($dst$$Register, $src$$Register);
  %}
  ins_pipe(pipe_slow);
%}
2065 
2066 // ============================================================================
2067 
// Scalar float add. The SSE forms are two-operand and destructive
// (dst = dst + operand); the AVX forms are three-operand non-destructive
// (dst = src1 + src2). Each flavor has register, memory-load and
// constant-table-immediate variants.
instruct addF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AddF dst src));

  format %{ "addss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct addF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AddF dst (LoadF src)));

  format %{ "addss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct addF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AddF dst con));
  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ addss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddF src1 src2));

  format %{ "vaddss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "vaddss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct addF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (AddF src con));

  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2138 
// Scalar double add: SSE destructive forms and AVX three-operand forms,
// each with register, memory-load and constant-table-immediate variants.
instruct addD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AddD dst src));

  format %{ "addsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct addD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AddD dst (LoadD src)));

  format %{ "addsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ addsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct addD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AddD dst con));
  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ addsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddD src1 src2));

  format %{ "vaddsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddD src1 (LoadD src2)));

  format %{ "vaddsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct addD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (AddD src con));

  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2209 
// Scalar float subtract: SSE destructive forms and AVX three-operand forms,
// each with register, memory-load and constant-table-immediate variants.
instruct subF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (SubF dst src));

  format %{ "subss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct subF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (SubF dst (LoadF src)));

  format %{ "subss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct subF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (SubF dst con));
  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ subss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubF src1 src2));

  format %{ "vsubss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubF src1 (LoadF src2)));

  format %{ "vsubss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct subF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (SubF src con));

  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2280 
// Scalar double subtract: SSE destructive forms and AVX three-operand forms,
// each with register, memory-load and constant-table-immediate variants.
instruct subD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (SubD dst src));

  format %{ "subsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct subD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (SubD dst (LoadD src)));

  format %{ "subsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ subsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct subD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (SubD dst con));
  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ subsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubD src1 src2));

  format %{ "vsubsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (SubD src1 (LoadD src2)));

  format %{ "vsubsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct subD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (SubD src con));

  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2351 
// Scalar float multiply: SSE destructive forms and AVX three-operand forms,
// each with register, memory-load and constant-table-immediate variants.
instruct mulF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (MulF dst src));

  format %{ "mulss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (MulF dst (LoadF src)));

  format %{ "mulss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (MulF dst con));
  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ mulss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulF src1 src2));

  format %{ "vmulss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "vmulss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (MulF src con));

  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2422 
// Scalar double multiply: SSE destructive forms and AVX three-operand forms,
// each with register, memory-load and constant-table-immediate variants.
instruct mulD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (MulD dst src));

  format %{ "mulsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (MulD dst (LoadD src)));

  format %{ "mulsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (MulD dst con));
  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulD src1 src2));

  format %{ "vmulsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulD src1 (LoadD src2)));

  format %{ "vmulsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct mulD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (MulD src con));

  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2493 
// Scalar float divide: SSE destructive forms and AVX three-operand forms,
// each with register, memory-load and constant-table-immediate variants.
instruct divF_reg(regF dst, regF src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (DivF dst src));

  format %{ "divss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct divF_mem(regF dst, memory src) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (DivF dst (LoadF src)));

  format %{ "divss   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct divF_imm(regF dst, immF con) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (DivF dst con));
  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ divss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivF src1 src2));

  format %{ "vdivss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivF src1 (LoadF src2)));

  format %{ "vdivss  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

instruct divF_reg_imm(regF dst, regF src, immF con) %{
  predicate(UseAVX > 0);
  match(Set dst (DivF src con));

  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2564 
// Double division, legacy SSE2 two-operand form: dst /= src (divsd is destructive,
// hence dst appears on both sides of the match). Used only when AVX is disabled.
instruct divD_reg(regD dst, regD src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (DivD dst src));

  format %{ "divsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Double division with the divisor loaded directly from memory (folds LoadD).
instruct divD_mem(regD dst, memory src) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (DivD dst (LoadD src)));

  format %{ "divsd   $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ divsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// Double division by a constant-table double.
instruct divD_imm(regD dst, immD con) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (DivD dst con));
  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ divsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2599 
// Double division, AVX three-operand form: dst = src1 / src2 (non-destructive vdivsd).
instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivD src1 src2));

  format %{ "vdivsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Double division with a memory divisor folded into the instruction.
instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
  predicate(UseAVX > 0);
  match(Set dst (DivD src1 (LoadD src2)));

  format %{ "vdivsd  $dst, $src1, $src2" %}
  ins_cost(150);
  ins_encode %{
    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// Double division by a constant-table double (AVX form).
instruct divD_reg_imm(regD dst, regD src, immD con) %{
  predicate(UseAVX > 0);
  match(Set dst (DivD src con));

  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2635 
// Float abs: clear the sign bit by ANDing with the 0x7fffffff mask kept at
// an external address (float_signmask). Destructive SSE form, AVX disabled.
instruct absF_reg(regF dst) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (AbsF dst));
  ins_cost(150);
  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
  ins_encode %{
    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
  %}
  ins_pipe(pipe_slow);
%}

// Float abs, AVX three-operand form (non-destructive src -> dst).
instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
  predicate(UseAVX > 0);
  match(Set dst (AbsF src));
  ins_cost(150);
  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
  ins_encode %{
    int vector_len = 0;  // scalar/128-bit encoding
    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
              ExternalAddress(float_signmask()), vector_len);
  %}
  ins_pipe(pipe_slow);
%}

// Double abs: AND with the 64-bit sign mask 0x7fffffffffffffff.
instruct absD_reg(regD dst) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (AbsD dst));
  ins_cost(150);
  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
            "# abs double by sign masking" %}
  ins_encode %{
    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
  %}
  ins_pipe(pipe_slow);
%}

// Double abs, AVX three-operand form.
instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
  predicate(UseAVX > 0);
  match(Set dst (AbsD src));
  ins_cost(150);
  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
            "# abs double by sign masking" %}
  ins_encode %{
    int vector_len = 0;  // scalar/128-bit encoding
    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
              ExternalAddress(double_signmask()), vector_len);
  %}
  ins_pipe(pipe_slow);
%}
2685 
// Float negate: flip the sign bit by XORing with 0x80000000 (float_signflip).
// Destructive SSE form, AVX disabled.
instruct negF_reg(regF dst) %{
  predicate((UseSSE>=1) && (UseAVX == 0));
  match(Set dst (NegF dst));
  ins_cost(150);
  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
  %}
  ins_pipe(pipe_slow);
%}

// Float negate, AVX non-destructive form via the vnegatess macro-assembler helper.
instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
  predicate(UseAVX > 0);
  match(Set dst (NegF src));
  ins_cost(150);
  format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
  ins_encode %{
    __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
                 ExternalAddress(float_signflip()));
  %}
  ins_pipe(pipe_slow);
%}

// Double negate: XOR with the 64-bit sign-flip mask 0x8000000000000000.
instruct negD_reg(regD dst) %{
  predicate((UseSSE>=2) && (UseAVX == 0));
  match(Set dst (NegD dst));
  ins_cost(150);
  format %{ "xorpd   $dst, [0x8000000000000000]\t"
            "# neg double by sign flipping" %}
  ins_encode %{
    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
  %}
  ins_pipe(pipe_slow);
%}

// Double negate, AVX non-destructive form via vnegatesd.
instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
  predicate(UseAVX > 0);
  match(Set dst (NegD src));
  ins_cost(150);
  format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
            "# neg double by sign flipping" %}
  ins_encode %{
    __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
                 ExternalAddress(double_signflip()));
  %}
  ins_pipe(pipe_slow);
%}
2733 
// Float square root from register (sqrtss). Available from SSE1 up,
// for both legacy-SSE and AVX configurations.
instruct sqrtF_reg(regF dst, regF src) %{
  predicate(UseSSE>=1);
  match(Set dst (SqrtF src));

  format %{ "sqrtss  $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Float square root with the operand loaded directly from memory (folds LoadF).
instruct sqrtF_mem(regF dst, memory src) %{
  predicate(UseSSE>=1);
  match(Set dst (SqrtF (LoadF src)));

  format %{ "sqrtss  $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtss($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// Float square root of a constant-table float.
instruct sqrtF_imm(regF dst, immF con) %{
  predicate(UseSSE>=1);
  match(Set dst (SqrtF con));

  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2769 
// Double square root from register (sqrtsd). Requires SSE2.
instruct sqrtD_reg(regD dst, regD src) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD src));

  format %{ "sqrtsd  $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// Double square root with the operand loaded directly from memory (folds LoadD).
instruct sqrtD_mem(regD dst, memory src) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD (LoadD src)));

  format %{ "sqrtsd  $dst, $src" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtsd($dst$$XMMRegister, $src$$Address);
  %}
  ins_pipe(pipe_slow);
%}

// Double square root of a constant-table double.
instruct sqrtD_imm(regD dst, immD con) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD con));
  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
  ins_cost(150);
  ins_encode %{
    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}
2804 
// Thread.onSpinWait() intrinsic: emit a PAUSE hint so a spin loop yields
// pipeline resources; uses the $$template form of format.
instruct onspinwait() %{
  match(OnSpinWait);
  ins_cost(200);

  format %{
    $$template
    $$emit$$"pause\t! membar_onspinwait"
  %}
  ins_encode %{
    __ pause();
  %}
  ins_pipe(pipe_slow);
%}
2818 
// Fused multiply-add, double: c = a * b + c in one rounding step.
// Note the accumulator c is both an input and the result register.
instruct fmaD_reg(regD a, regD b, regD c) %{
  predicate(UseFMA);
  match(Set c (FmaD  c (Binary a b)));
  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
  ins_cost(150);
  ins_encode %{
    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Fused multiply-add, float: c = a * b + c in one rounding step.
instruct fmaF_reg(regF a, regF b, regF c) %{
  predicate(UseFMA);
  match(Set c (FmaF  c (Binary a b)));
  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
  ins_cost(150);
  ins_encode %{
    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
2842 
2843 // ====================VECTOR INSTRUCTIONS=====================================
2844 
2845 
2846 // Load vectors (4 bytes long)
// Load a 4-byte vector from memory with movd (low 32 bits of the XMM register).
instruct loadV4(vecS dst, memory mem) %{
  predicate(n->as_LoadVector()->memory_size() == 4);
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Register-to-register move of a 4-byte vector into the "legacy" (non-EVEX)
// register class. (The "load vector" wording below is the file's convention;
// this is a reg-reg copy, not a memory load.)
instruct MoveVecS2Leg(legVecS dst, vecS src) %{
  match(Set dst src);
  format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
  ins_encode %{
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Reverse direction: legacy register class back to the general vecS class.
instruct MoveLeg2VecS(vecS dst, legVecS src) %{
  match(Set dst src);
  format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
  ins_encode %{
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
2877 
2878 // Load vectors (8 bytes long)
// Load an 8-byte vector from memory with movq (low 64 bits of the XMM register).
instruct loadV8(vecD dst, memory mem) %{
  predicate(n->as_LoadVector()->memory_size() == 8);
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Reg-reg move of an 8-byte vector into the legacy register class (movsd).
instruct MoveVecD2Leg(legVecD dst, vecD src) %{
  match(Set dst src);
  format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Reverse direction: legacy class back to vecD.
instruct MoveLeg2VecD(vecD dst, legVecD src) %{
  match(Set dst src);
  format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
2909 
2910 // Load vectors (16 bytes long)
// Load a 16-byte (128-bit) vector with movdqu (unaligned).
instruct loadV16(vecX dst, memory mem) %{
  predicate(n->as_LoadVector()->memory_size() == 16);
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Reg-reg move of a 16-byte vector into the legacy register class. When the
// destination may be a high (xmm16+) register that legacy/VEX encodings cannot
// reach (AVX-512 without AVX512VL), fall back to the EVEX evmovdquq form.
instruct MoveVecX2Leg(legVecX dst, vecX src) %{
  match(Set dst src);
  format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
  ins_encode %{
    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vector_len = 2;  // EVEX 512-bit encoding, matching the 64-byte patterns below
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// Reverse direction: legacy class back to vecX, same encoding fallback.
instruct MoveLeg2VecX(vecX dst, legVecX src) %{
  match(Set dst src);
  format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
  ins_encode %{
    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vector_len = 2;  // EVEX 512-bit encoding
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}
2951 
2952 // Load vectors (32 bytes long)
// Load a 32-byte (256-bit) vector with vmovdqu (unaligned).
instruct loadV32(vecY dst, memory mem) %{
  predicate(n->as_LoadVector()->memory_size() == 32);
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
  ins_encode %{
    __ vmovdqu($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Reg-reg move of a 32-byte vector into the legacy register class; EVEX
// fallback when AVX-512 lacks AVX512VL (same rationale as MoveVecX2Leg).
instruct MoveVecY2Leg(legVecY dst, vecY src) %{
  match(Set dst src);
  format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
  ins_encode %{
    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vector_len = 2;  // EVEX 512-bit encoding
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}

// Reverse direction: legacy class back to vecY, same encoding fallback.
instruct MoveLeg2VecY(vecY dst, legVecY src) %{
  match(Set dst src);
  format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
  ins_encode %{
    if (UseAVX < 2 || VM_Version::supports_avx512vl()) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    } else {
      int vector_len = 2;  // EVEX 512-bit encoding
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
    }
  %}
  ins_pipe( fpu_reg_reg );
%}
2993 
2994 // Load vectors (64 bytes long)
// Load a 64-byte (512-bit) vector with element size <= 4 bytes: use the
// dword-granular EVEX move (evmovdqul); vector_len 2 selects the 512-bit form.
instruct loadV64_dword(vecZ dst, memory mem) %{
  predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Load a 64-byte vector with element size > 4 bytes: qword-granular EVEX move.
instruct loadV64_qword(vecZ dst, memory mem) %{
  predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
  match(Set dst (LoadVector mem));
  ins_cost(125);
  format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Reg-reg move of a 64-byte vector into the legacy register class.
instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
  match(Set dst src);
  format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

// Reverse direction: legacy class back to vecZ.
instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
  match(Set dst src);
  format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
3039 
3040 // Store vectors
// Store vectors — mirror images of the loadV* patterns above, selected by
// the StoreVector node's memory size (and element size for the 64-byte case).

// Store a 4-byte vector (movd).
instruct storeV4(memory mem, vecS src) %{
  predicate(n->as_StoreVector()->memory_size() == 4);
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
  ins_encode %{
    __ movdl($mem$$Address, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store an 8-byte vector (movq).
instruct storeV8(memory mem, vecD src) %{
  predicate(n->as_StoreVector()->memory_size() == 8);
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
  ins_encode %{
    __ movq($mem$$Address, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store a 16-byte vector (unaligned movdqu).
instruct storeV16(memory mem, vecX src) %{
  predicate(n->as_StoreVector()->memory_size() == 16);
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
  ins_encode %{
    __ movdqu($mem$$Address, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store a 32-byte vector (unaligned vmovdqu).
instruct storeV32(memory mem, vecY src) %{
  predicate(n->as_StoreVector()->memory_size() == 32);
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
  ins_encode %{
    __ vmovdqu($mem$$Address, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store a 64-byte vector, element size <= 4: dword-granular EVEX move.
instruct storeV64_dword(memory mem, vecZ src) %{
  predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Store a 64-byte vector, element size > 4: qword-granular EVEX move.
instruct storeV64_qword(memory mem, vecZ src) %{
  predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
  match(Set mem (StoreVector mem src));
  ins_cost(145);
  format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
3108 
3109 // ====================LEGACY REPLICATE=======================================
3110 
// Legacy (pre-AVX512VLBW) byte-replicate patterns. The common recipe:
// punpcklbw duplicates each byte into a word, pshuflw broadcasts the low
// word across the low quadword, punpcklqdq fills the upper quadword.

// Replicate a byte loaded from memory into a 4-byte vector.
instruct Repl4B_mem(vecS dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "punpcklbw $dst,$mem\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
  ins_encode %{
    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a byte loaded from memory into an 8-byte vector.
instruct Repl8B_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "punpcklbw $dst,$mem\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
  ins_encode %{
    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a byte from a GP register into a 16-byte vector.
instruct Repl16B(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\t! replicate16B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a byte loaded from memory into a 16-byte vector.
instruct Repl16B_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "punpcklbw $dst,$mem\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\t! replicate16B" %}
  ins_encode %{
    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3164 
// Wider byte-replicates extend the 16-byte recipe: vinserti128_high copies
// the low 128 bits to the high half of the YMM register; for 64 bytes,
// vinserti64x4 then duplicates the 256-bit half across the ZMM register.

// Replicate a byte from a GP register into a 32-byte vector.
instruct Repl32B(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate32B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a byte loaded from memory into a 32-byte vector.
instruct Repl32B_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "punpcklbw $dst,$mem\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate32B" %}
  ins_encode %{
    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a byte from a GP register into a 64-byte vector.
// NOTE(review): the format string ends "...$dst\t" (no "\n") before the
// vinserti64x4 line, so the two instructions print on one line in
// PrintOptoAssembly output — cosmetic only; other patterns use "\n\t".
instruct Repl64B(legVecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a byte loaded from memory into a 64-byte vector.
instruct Repl64B_mem(legVecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "punpcklbw $dst,$mem\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
  ins_encode %{
    __ punpcklbw($dst$$XMMRegister, $mem$$Address);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3236 
// Replicate an immediate byte into a 16-byte vector: replicate8_imm builds
// an 8-byte constant of repeated bytes in the constant table (element size 1),
// then punpcklqdq duplicates it into the upper quadword.
instruct Repl16B_imm(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3248 
// Replicate an immediate byte into a 32-byte vector: load the 8-byte repeated
// constant (replicate8_imm, element size 1), duplicate it across the low
// 128 bits with punpcklqdq, then copy to the high YMM half.
// Fix: format text said "lreplicate32B" — typo for "replicate32B"
// (debug/PrintOptoAssembly output only; encoding unchanged).
instruct Repl32B_imm(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3262 
// Replicate an immediate byte into a 64-byte vector (constant-table load,
// quadword duplicate, then fill high 128 bits and high 256 bits).
// NOTE(review): format line "vinserti128_high $dst,$dst\t" lacks a "\n"
// before the vinserti64x4 line — cosmetic debug-output issue only.
instruct Repl64B_imm(legVecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3278 
// Legacy (pre-AVX512VLBW) short-replicate patterns. pshuflw with imm 0x00
// broadcasts the low 16-bit element across the low quadword; punpcklqdq
// then fills the upper quadword for 128-bit results.

// Replicate a short from a GP register into a 4-element vector.
instruct Repl4S(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a short loaded from memory into a 4-element vector; the load is
// folded into pshuflw's memory operand.
instruct Repl4S_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a short from a GP register into an 8-element vector.
instruct Repl8S(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\t! replicate8S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a short loaded from memory into an 8-element vector.
instruct Repl8S_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\n\t"
            "punpcklqdq $dst,$dst\t! replicate8S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate an immediate short into an 8-element vector via the constant
// table (replicate8_imm with element size 2).
instruct Repl8S_imm(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3338 
// 16-element short replicates (256-bit): build the 128-bit broadcast, then
// copy it to the high YMM half with vinserti128_high.

// Replicate a short from a GP register into a 16-element vector.
instruct Repl16S(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate16S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a short loaded from memory into a 16-element vector.
instruct Repl16S_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate16S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate an immediate short into a 16-element vector via the constant table.
instruct Repl16S_imm(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3382 
// 32-element short replicates (512-bit): extend the 256-bit broadcast with
// vinserti64x4 to duplicate across the full ZMM register.
// NOTE(review): the "vinserti128_high $dst,$dst\t" format lines below lack a
// "\n" before the vinserti64x4 line — cosmetic debug-output issue only.

// Replicate a short from a GP register into a 32-element vector.
instruct Repl32S(legVecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate a short loaded from memory into a 32-element vector.
instruct Repl32S_mem(legVecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "pshuflw $dst,$mem,0x00\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
  ins_encode %{
    __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}

// Replicate an immediate short into a 32-element vector via the constant table.
instruct Repl32S_imm(legVecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3432 
3433 instruct Repl4I(vecX dst, rRegI src) %{
3434   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3435   match(Set dst (ReplicateI src));
3436   format %{ "movd    $dst,$src\n\t"
3437             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3438   ins_encode %{
3439     __ movdl($dst$$XMMRegister, $src$$Register);
3440     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3441   %}
3442   ins_pipe( pipe_slow );
3443 %}
3444 
// Broadcast a 32-bit value straight from memory into all 4 int lanes:
// pshufd with a memory operand both loads and splats in one instruction.
instruct Repl4I_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
3454 
// Broadcast a 32-bit GPR value into all 8 int lanes of a 256-bit YMM vector:
// splat into the low 128 bits, then copy the low half into the high half.
instruct Repl8I(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "movd    $dst,$src\n\t"
            "pshufd  $dst,$dst,0x00\n\t"
            "vinserti128_high $dst,$dst\t! replicate8I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3468 
// Broadcast a 32-bit value from memory into all 8 int lanes of a YMM vector:
// load+splat with pshufd, then duplicate the low 128 bits into the high half.
instruct Repl8I_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "pshufd  $dst,$mem,0x00\n\t"
            "vinserti128_high $dst,$dst\t! replicate8I" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3480 
// Broadcast a 32-bit GPR value into all 16 int lanes of a 512-bit vector
// without AVX512VL: splat to 128 bits, then widen 128->256->512.
instruct Repl16I(legVecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  // Fixed format string: "\n\t" separates the listed instructions so the
  // disassembly prints one per line.
  format %{ "movd    $dst,$src\n\t"
            "pshufd  $dst,$dst,0x00\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3496 
// Broadcast a 32-bit value from memory into all 16 int lanes of a 512-bit
// vector without AVX512VL: load+splat, then widen 128->256->512.
instruct Repl16I_mem(legVecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "pshufd  $dst,$mem,0x00\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3510 
// Replicate an int immediate into all 4 lanes of an XMM vector: load the
// 8-byte replicated pattern from the constant table, then duplicate it into
// the upper 64 bits with punpcklqdq.
instruct Repl4I_imm(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
            "punpcklqdq $dst,$dst" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3522 
// Replicate an int immediate into all 8 lanes of a YMM vector: build the
// 128-bit pattern from the constant table, then copy it into the high half.
instruct Repl8I_imm(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3536 
// Replicate an int immediate into all 16 lanes of a 512-bit vector without
// AVX512VL: build the 128-bit pattern from the constant table, then widen
// 128->256->512.
instruct Repl16I_imm(legVecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  // Fixed format string: "\n\t" separates every listed instruction (the
  // original ran movq/vinserti128_high into the following lines).
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3552 
3553 // Long could be loaded into xmm register directly from memory.
// Replicate a 64-bit value loaded from memory into both long lanes of an
// XMM vector (movq into the low half, punpcklqdq to duplicate it).
instruct Repl2L_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "movq    $dst,$mem\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $mem$$Address);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3565 
3566 // Replicate long (8 byte) scalar to be vector
3567 #ifdef _LP64
// 64-bit VM: broadcast a long GPR into all 4 lanes of a YMM vector
// (movdq from GPR, duplicate to 128 bits, then copy into the high half).
instruct Repl4L(vecY dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  format %{ "movdq   $dst,$src\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3581 
// 64-bit VM: broadcast a long GPR into all 8 lanes of a 512-bit vector
// without AVX512VL: splat to 128 bits, then widen 128->256->512.
instruct Repl8L(legVecZ dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "movdq   $dst,$src\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
  ins_encode %{
    __ movdq($dst$$XMMRegister, $src$$Register);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3597 #else // _LP64
// 32-bit VM: a long lives in a GPR pair, so assemble lo/hi halves into one
// 64-bit XMM lane via two movdl + punpckldq, then splat to all 4 lanes.
instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3616 
// 32-bit VM: assemble the long from its lo/hi GPR halves into one 64-bit
// lane, then splat to 128 bits and widen 128->256->512 (no AVX512VL).
instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3637 #endif // _LP64
3638 
// Replicate a long immediate into all 4 lanes of a YMM vector: load the
// constant from the constant table, duplicate to 128 bits, copy to high half.
instruct Repl4L_imm(vecY dst, immL con) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3652 
// Replicate a long immediate into all 8 lanes of a 512-bit vector without
// AVX512VL: constant-table load, splat to 128 bits, widen 128->256->512.
instruct Repl8L_imm(legVecZ dst, immL con) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL con));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3668 
// Replicate a long loaded from memory into all 4 lanes of a YMM vector.
instruct Repl4L_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "movq    $dst,$mem\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\t! replicate4L" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $mem$$Address);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3682 
// Replicate a long loaded from memory into all 8 lanes of a 512-bit vector
// without AVX512VL: load, splat to 128 bits, widen 128->256->512.
instruct Repl8L_mem(legVecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "movq    $dst,$mem\n\t"
            "punpcklqdq $dst,$dst\n\t"
            "vinserti128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $mem$$Address);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3698 
// Broadcast a float from memory into both lanes of a 64-bit vector:
// pshufd with a memory operand loads and splats in one instruction.
instruct Repl2F_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
3708 
// Broadcast a float from memory into all 4 lanes of an XMM vector.
instruct Repl4F_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
3718 
// Broadcast a float register into all 8 lanes of a YMM vector: splat the
// low 128 bits with pshufd, then copy them into the high half.
instruct Repl8F(vecY dst, vlRegF src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF src));
  format %{ "pshufd  $dst,$src,0x00\n\t"
            "vinsertf128_high $dst,$dst\t! replicate8F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3730 
// Broadcast a float from memory into all 8 lanes of a YMM vector.
instruct Repl8F_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "pshufd  $dst,$mem,0x00\n\t"
            "vinsertf128_high $dst,$dst\t! replicate8F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3742 
// Broadcast a float register into all 16 lanes of a 512-bit vector without
// AVX512VL: splat to 128 bits, then widen 128->256->512.
instruct Repl16F(legVecZ dst, vlRegF src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF src));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "pshufd  $dst,$src,0x00\n\t"
            "vinsertf128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3756 
// Broadcast a float from memory into all 16 lanes of a 512-bit vector
// without AVX512VL: load+splat, then widen 128->256->512.
instruct Repl16F_mem(legVecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "pshufd  $dst,$mem,0x00\n\t"
            "vinsertf128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3770 
// Zero both float lanes of a 64-bit vector (xorps reg with itself).
instruct Repl2F_zero(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
  match(Set dst (ReplicateF zero));
  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
3780 
// Zero all 4 float lanes of an XMM vector.
instruct Repl4F_zero(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
  match(Set dst (ReplicateF zero));
  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
3790 
// Zero all 8 float lanes of a YMM vector; vector_len 1 selects the
// 256-bit form of vxorps.
instruct Repl8F_zero(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
  match(Set dst (ReplicateF zero));
  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
    int vector_len = 1;
    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
3801 
// Broadcast a double from memory into both lanes of an XMM vector;
// pshufd imm 0x44 duplicates the low 64 bits into the high 64 bits.
instruct Repl2D_mem(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
  %}
  ins_pipe( pipe_slow );
%}
3811 
// Broadcast a double register into all 4 lanes of a YMM vector:
// duplicate within 128 bits (pshufd 0x44), then copy into the high half.
instruct Repl4D(vecY dst, vlRegD src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD src));
  format %{ "pshufd  $dst,$src,0x44\n\t"
            "vinsertf128_high $dst,$dst\t! replicate4D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3823 
// Broadcast a double from memory into all 4 lanes of a YMM vector.
instruct Repl4D_mem(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "pshufd  $dst,$mem,0x44\n\t"
            "vinsertf128_high $dst,$dst\t! replicate4D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
3835 
// Broadcast a double register into all 8 lanes of a 512-bit vector without
// AVX512VL: duplicate within 128 bits, then widen 128->256->512.
instruct Repl8D(legVecZ dst, vlRegD src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD src));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "pshufd  $dst,$src,0x44\n\t"
            "vinsertf128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3849 
// Broadcast a double from memory into all 8 lanes of a 512-bit vector
// without AVX512VL: load+duplicate, then widen 128->256->512.
instruct Repl8D_mem(legVecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  // Fixed format string: "\n\t" separates the listed instructions.
  format %{ "pshufd  $dst,$mem,0x44\n\t"
            "vinsertf128_high $dst,$dst\n\t"
            "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
  %}
  ins_pipe( pipe_slow );
%}
3863 
3864 // Replicate double (8 byte) scalar zero to be vector
// Zero both double lanes of an XMM vector.
instruct Repl2D_zero(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
  match(Set dst (ReplicateD zero));
  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
  ins_encode %{
    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
3874 
// Zero all 4 double lanes of a YMM vector; vector_len 1 selects the
// 256-bit form of vxorpd.
instruct Repl4D_zero(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
  match(Set dst (ReplicateD zero));
  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
  ins_encode %{
    int vector_len = 1;
    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
3885 
3886 // ====================GENERIC REPLICATE==========================================
3887 
3888 // Replicate byte scalar to be vector
// Broadcast a byte value from a GPR into the 4 byte lanes of a 32-bit
// vector: widen the byte pair with punpcklbw, then splat with pshuflw.
instruct Repl4B(vecS dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
3902 
// Broadcast a byte value from a GPR into the 8 byte lanes of a 64-bit vector.
instruct Repl8B(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateB src));
  format %{ "movd    $dst,$src\n\t"
            "punpcklbw $dst,$dst\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
3916 
3917 // Replicate byte scalar immediate to be vector by loading from const table.
// Replicate a byte immediate into 4 lanes by loading a 4-byte replicated
// pattern from the constant table.
instruct Repl4B_imm(vecS dst, immI con) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateB con));
  format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
  %}
  ins_pipe( pipe_slow );
%}
3927 
// Replicate a byte immediate into 8 lanes by loading an 8-byte replicated
// pattern from the constant table.
instruct Repl8B_imm(vecD dst, immI con) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
  %}
  ins_pipe( pipe_slow );
%}
3937 
3938 // Replicate byte scalar zero to be vector
// Zero a 4-byte vector (pxor reg with itself).
instruct Repl4B_zero(vecS dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateB zero));
  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
3948 
// Zero an 8-byte vector.
instruct Repl8B_zero(vecD dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateB zero));
  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
3958 
// Zero a 16-byte (XMM) vector.
instruct Repl16B_zero(vecX dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (ReplicateB zero));
  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
3968 
// Zero a 32-byte (YMM) vector.
instruct Repl32B_zero(vecY dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 32);
  match(Set dst (ReplicateB zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
  ins_encode %{
    // vector_len 1 selects the 256-bit form of vpxor (requires AVX2;
    // pre-AVX2 256-bit zeroing would need vxorpd instead).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
3980 
3981 // Replicate char/short (2 byte) scalar to be vector
// Broadcast a 16-bit GPR value into both short lanes of a 32-bit vector.
instruct Repl2S(vecS dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateS src));
  format %{ "movd    $dst,$src\n\t"
            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}
3993 
3994 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
// Replicate a short immediate into 2 lanes by loading a 4-byte replicated
// pattern from the constant table.
instruct Repl2S_imm(vecS dst, immI con) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateS con));
  format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
  %}
  ins_pipe( fpu_reg_reg );
%}
4004 
// Replicate a short immediate into 4 lanes by loading an 8-byte replicated
// pattern from the constant table.
instruct Repl4S_imm(vecD dst, immI con) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
  %}
  ins_pipe( fpu_reg_reg );
%}
4014 
4015 // Replicate char/short (2 byte) scalar zero to be vector
// Zero a 2-short (32-bit) vector.
instruct Repl2S_zero(vecS dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateS zero));
  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
4025 
// Zero a 4-short (64-bit) vector.
instruct Repl4S_zero(vecD dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateS zero));
  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
4035 
// Zero an 8-short (128-bit) vector.
instruct Repl8S_zero(vecX dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateS zero));
  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
4045 
// Zero a 16-short (256-bit) vector.
instruct Repl16S_zero(vecY dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (ReplicateS zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
  ins_encode %{
    // vector_len 1 selects the 256-bit form of vpxor (requires AVX2;
    // pre-AVX2 256-bit zeroing would need vxorpd instead).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4057 
4058 // Replicate integer (4 byte) scalar to be vector
// Broadcast a 32-bit GPR value into both int lanes of a 64-bit vector.
instruct Repl2I(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI src));
  format %{ "movd    $dst,$src\n\t"
            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}
4070 
4071 // Integer could be loaded into xmm register directly from memory.
// Broadcast a 32-bit value loaded from memory into both int lanes.
instruct Repl2I_mem(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "movd    $dst,$mem\n\t"
            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $mem$$Address);
    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}
4083 
4084 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
// Replicate an int immediate into 2 lanes by loading an 8-byte replicated
// pattern from the constant table.
instruct Repl2I_imm(vecD dst, immI con) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
  %}
  ins_pipe( fpu_reg_reg );
%}
4094 
4095 // Replicate integer (4 byte) scalar zero to be vector
// Zero a 2-int (64-bit) vector.
instruct Repl2I_zero(vecD dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateI zero));
  // Fixed format comment suffix: " zero" added for consistency with the
  // other Repl*_zero instructs.
  format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
4105 
// Zero a 4-int (128-bit) vector.
instruct Repl4I_zero(vecX dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateI zero));
  // Fixed format string: removed the stray ')' after "zero".
  format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
4115 
// Zero an 8-int (256-bit) vector.
instruct Repl8I_zero(vecY dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (ReplicateI zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
  ins_encode %{
    // vector_len 1 selects the 256-bit form of vpxor (requires AVX2;
    // pre-AVX2 256-bit zeroing would need vxorpd instead).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4127 
4128 // Replicate long (8 byte) scalar to be vector
4129 #ifdef _LP64
4130 instruct Repl2L(vecX dst, rRegL src) %{
4131   predicate(n->as_Vector()->length() == 2);
4132   match(Set dst (ReplicateL src));
4133   format %{ "movdq   $dst,$src\n\t"
4134             "punpcklqdq $dst,$dst\t! replicate2L" %}
4135   ins_encode %{
4136     __ movdq($dst$$XMMRegister, $src$$Register);
4137     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4138   %}
4139   ins_pipe( pipe_slow );
4140 %}
4141 #else // _LP64
// 32-bit VM: the long is split across a GPR pair; assemble lo/hi halves
// into one 64-bit lane (movdl x2 + punpckldq), then duplicate it.
instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L"%}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
4158 #endif // _LP64
4159 
4160 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
// Replicate a long immediate into both lanes: load the constant from the
// constant table and duplicate it into the upper 64 bits.
instruct Repl2L_imm(vecX dst, immL con) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
  ins_encode %{
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
4172 
4173 // Replicate long (8 byte) scalar zero to be vector
// Zero a 2-long (128-bit) vector.
instruct Repl2L_zero(vecX dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateL zero));
  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( fpu_reg_reg );
%}
4183 
// Zero a 4-long (256-bit) vector.
instruct Repl4L_zero(vecY dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateL zero));
  format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
  ins_encode %{
    // vector_len 1 selects the 256-bit form of vpxor (requires AVX2;
    // pre-AVX2 256-bit zeroing would need vxorpd instead).
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4195 
4196 // Replicate float (4 byte) scalar to be vector
// Broadcast a float register into both lanes of a 64-bit vector.
instruct Repl2F(vecD dst, vlRegF src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateF src));
  // Fixed format string: the encoding shuffles from $src, not $dst, so the
  // printed source operand must be $src.
  format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  %}
  ins_pipe( fpu_reg_reg );
%}
4206 
// Broadcast a float register into all 4 lanes of an XMM vector.
instruct Repl4F(vecX dst, vlRegF src) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (ReplicateF src));
  // Fixed format string: the encoding shuffles from $src, not $dst, so the
  // printed source operand must be $src.
  format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
  %}
  ins_pipe( pipe_slow );
%}
4216 
4217 // Replicate double (8 bytes) scalar to be vector
// Broadcast a double register into both lanes of an XMM vector
// (pshufd imm 0x44 duplicates the low 64 bits).
instruct Repl2D(vecX dst, vlRegD src) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (ReplicateD src));
  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
  ins_encode %{
    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
  %}
  ins_pipe( pipe_slow );
%}
4227 
4228 // ====================EVEX REPLICATE=============================================
4229 
// EVEX form: broadcast a byte from memory into 4 lanes with a single
// vpbroadcastb (requires AVX-512 VL+BW); vector_len 0 = 128-bit.
instruct Repl4B_mem_evex(vecS dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB (LoadB mem)));
  format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4240 
4241 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4242   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4243   match(Set dst (ReplicateB (LoadB mem)));
4244   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4245   ins_encode %{
4246     int vector_len = 0;
4247     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4248   %}
4249   ins_pipe( pipe_slow );
4250 %}
4251 
4252 instruct Repl16B_evex(vecX dst, rRegI src) %{
4253   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4254   match(Set dst (ReplicateB src));
4255   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4256   ins_encode %{
4257    int vector_len = 0;
4258     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4259   %}
4260   ins_pipe( pipe_slow );
4261 %}
4262 
4263 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4264   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4265   match(Set dst (ReplicateB (LoadB mem)));
4266   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4267   ins_encode %{
4268     int vector_len = 0;
4269     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4270   %}
4271   ins_pipe( pipe_slow );
4272 %}
4273 
4274 instruct Repl32B_evex(vecY dst, rRegI src) %{
4275   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4276   match(Set dst (ReplicateB src));
4277   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4278   ins_encode %{
4279    int vector_len = 1;
4280     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4281   %}
4282   ins_pipe( pipe_slow );
4283 %}
4284 
4285 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4286   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4287   match(Set dst (ReplicateB (LoadB mem)));
4288   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4289   ins_encode %{
4290     int vector_len = 1;
4291     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4292   %}
4293   ins_pipe( pipe_slow );
4294 %}
4295 
4296 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4297   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4298   match(Set dst (ReplicateB src));
4299   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4300   ins_encode %{
4301    int vector_len = 2;
4302     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4303   %}
4304   ins_pipe( pipe_slow );
4305 %}
4306 
4307 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4308   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4309   match(Set dst (ReplicateB (LoadB mem)));
4310   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4311   ins_encode %{
4312     int vector_len = 2;
4313     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4314   %}
4315   ins_pipe( pipe_slow );
4316 %}
4317 
// Replicate a byte immediate: materialize 8 copies of the constant from the
// constant table (replicate8_imm packs the low byte 8 times), then broadcast
// that quadword across the vector. vector_len: 0=128-bit, 1=256-bit, 2=512-bit.
instruct Repl16B_imm_evex(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastb $dst,$dst\t! replicate16B" %}
  ins_encode %{
   int vector_len = 0;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32B_imm_evex(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastb $dst,$dst\t! replicate32B" %}
  ins_encode %{
   int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl64B_imm_evex(vecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateB con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
  ins_encode %{
   int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
    __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4356 
// Zero a full 512-bit vector of bytes (xor of the register with itself).
instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
  match(Set dst (ReplicateB zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
  ins_encode %{
    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
    int vector_len = 2; // 512-bit operation
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4368 
// Replicate short (2 byte) scalar (from register or memory) into a vector
// using AVX-512 word broadcasts.
// vector_len encoding: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.
// The sub-512-bit forms require AVX512VL+BW; the 512-bit forms require AVX512BW.
instruct Repl4S_evex(vecD dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
  ins_encode %{
   int vector_len = 0;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4S_mem_evex(vecD dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_evex(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
  ins_encode %{
   int vector_len = 0;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8S_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_evex(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS src));
  format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
  ins_encode %{
   int vector_len = 1;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_evex(vecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateS src));
  format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
  ins_encode %{
   int vector_len = 2;
    __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateS (LoadS mem)));
  format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4456 
// Replicate a short immediate: load 4 packed copies of the 16-bit constant
// from the constant table (replicate8_imm with element size 2), then
// broadcast across the vector. vector_len: 0=128-bit, 1=256-bit, 2=512-bit.
instruct Repl8S_imm_evex(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastw $dst,$dst\t! replicate8S" %}
  ins_encode %{
   int vector_len = 0;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16S_imm_evex(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastw $dst,$dst\t! replicate16S" %}
  ins_encode %{
   int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl32S_imm_evex(vecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
  match(Set dst (ReplicateS con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastw $dst,$dst\t! replicate32S" %}
  ins_encode %{
   int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
    __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4495 
// Zero a full 512-bit vector of shorts (xor of the register with itself).
instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
  match(Set dst (ReplicateS zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
  ins_encode %{
    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
    int vector_len = 2; // 512-bit operation
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4507 
// Replicate int (4 byte) scalar (from register or memory) into a vector
// using AVX-512 dword broadcasts.
// vector_len encoding: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.
// The sub-512-bit forms require AVX512VL; the 512-bit forms need only AVX-512F.
instruct Repl4I_evex(vecX dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
  ins_encode %{
    int vector_len = 0;
    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4I_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_evex(vecY dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI src));
  format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_evex(vecZ dst, rRegI src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI src));
  format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI (LoadI mem)));
  format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4573 
// Replicate an int immediate: load 2 packed copies of the 32-bit constant
// from the constant table (replicate8_imm with element size 4), then
// broadcast across the vector. vector_len: 0=128-bit, 1=256-bit, 2=512-bit.
instruct Repl4I_imm_evex(vecX dst, immI con) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
            "vpbroadcastd  $dst,$dst\t! replicate4I" %}
  ins_encode %{
    int vector_len = 0;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8I_imm_evex(vecY dst, immI con) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
            "vpbroadcastd  $dst,$dst\t! replicate8I" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16I_imm_evex(vecZ dst, immI con) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
            "vpbroadcastd  $dst,$dst\t! replicate16I" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
    __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4612 
// Zero a full 512-bit vector of ints (xor of the register with itself).
instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateI zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
  ins_encode %{
    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
    int vector_len = 2; // 512-bit operation
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4624 
// Replicate long (8 byte) scalar to be vector
// On 64-bit, the GP register can be broadcast directly with evpbroadcastq.
// On 32-bit, the long lives in a GP register pair, so the lo/hi halves are
// assembled into an XMM register (movdl + punpckldq) before broadcasting.
#ifdef _LP64
instruct Repl4L_evex(vecY dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_evex(vecZ dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL src));
  format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
#else // _LP64
instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64
4685 
// Replicate a long immediate: load the 64-bit constant from the constant
// table, then broadcast it across the vector (vector_len 1 = 256-bit,
// 2 = 512-bit).
instruct Repl4L_imm_evex(vecY dst, immL con) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_imm_evex(vecZ dst, immL con) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4711 
// Replicate long (8 byte) scalar loaded from memory into a vector using the
// AVX-512 quadword broadcast. vector_len: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.
// Note: the format strings previously said "vpbroadcastd", which did not match
// the emitted vpbroadcastq and produced misleading PrintOptoAssembly output.
instruct Repl2L_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4L_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4744 
// Zero a full 512-bit vector of longs (xor of the register with itself).
instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
  ins_encode %{
    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
    int vector_len = 2; // 512-bit operation
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4756 
// Replicate float scalar (from XMM register or memory) into a vector using
// the single-precision broadcast. vector_len 1 = 256-bit, 2 = 512-bit.
// The 256-bit forms require AVX512VL; the 512-bit forms need only AVX-512F.
instruct Repl8F_evex(vecY dst, regF src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF src));
  format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8F_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16F_evex(vecZ dst, regF src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF src));
  format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4800 
// Zero float vectors under EVEX. All sizes use a single 512-bit vpxor: the
// upper bits beyond the vector's logical width are simply zeroed as well.
instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
  ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4848 
// Replicate double scalar (from XMM register or memory) into a vector using
// the double-precision broadcast. vector_len 1 = 256-bit, 2 = 512-bit.
// The 256-bit forms require AVX512VL; the 512-bit forms need only AVX-512F.
instruct Repl4D_evex(vecY dst, regD src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD src));
  format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4D_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8D_evex(vecZ dst, regD src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD src));
  format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
4892 
// Zero double vectors under EVEX. All sizes use a single 512-bit vpxor: the
// upper bits beyond the vector's logical width are simply zeroed as well.
instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
  ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
  ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
  ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
4928 
4929 // ====================REDUCTION ARITHMETIC=======================================
4930 
// Add-reduce a 2-element int vector into a scalar (SSE3+ path, no AVX):
// horizontal add folds the two lanes, then the scalar src1 is added in.
instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "movdqu  $tmp2,$src2\n\t"
            "phaddd  $tmp2,$tmp2\n\t"
            "movd    $tmp,$src1\n\t"
            "paddd   $tmp,$tmp2\n\t"
            "movd    $dst,$tmp\t! add reduction2I" %}
  ins_encode %{
    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);   // lane0+lane1 in low dword
    __ movdl($tmp$$XMMRegister, $src1$$Register);        // bring scalar into XMM
    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($dst$$Register, $tmp$$XMMRegister);         // result back to GP reg
  %}
  ins_pipe( pipe_slow );
%}
4949 
// Add-reduce a 2-element int vector into a scalar (AVX1-only path):
// vphaddd folds the two lanes, then the scalar src1 is added in.
instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(VM_Version::supports_avxonly());
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
            "movd     $dst,$tmp2\t! add reduction2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit operations
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
4967 
// Add-reduce a 2-element int vector into a scalar (AVX-512 path):
// pshufd brings lane1 down next to lane0 so a plain vpaddd folds them
// (avoids vphaddd, which is not available in EVEX form).
instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
            "vpaddd  $tmp,$src2,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit operations
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);  // lane1 -> lane0 position
    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
4987 
// Add-reduce a 4-element int vector into a scalar (SSE3+ path, no AVX):
// two horizontal adds fold 4 lanes -> 2 -> 1, then src1 is added in.
instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu  $tmp,$src2\n\t"
            "phaddd  $tmp,$tmp\n\t"
            "phaddd  $tmp,$tmp\n\t"
            "movd    $tmp2,$src1\n\t"
            "paddd   $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);   // 4 lanes -> 2 partial sums
    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);   // 2 partial sums -> 1
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5008 
// Add-reduction of a 4-element int vector on AVX1/AVX2-only CPUs
// (VM_Version::supports_avxonly()): dst = src1 + sum(src2[0..3]),
// collapsing the vector with two vphaddd horizontal adds.
instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(VM_Version::supports_avxonly());
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
            "vphaddd  $tmp,$tmp,$tmp\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
            "movd     $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    // Two horizontal adds: 4 dwords -> 2 partial sums -> full sum in dword 0.
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    // Fold in the scalar operand src1 and move dword 0 to the GPR result.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5028 
// Add-reduction of a 4-element int vector on AVX-512 (UseAVX > 2):
// dst = src1 + sum(src2[0..3]). Uses shuffle/add pairs instead of
// vphaddd (horizontal adds are comparatively slow on these CPUs).
instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "vpaddd  $tmp,$src2,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    // 0xE moves the upper qword (elements 2,3) down; add halves pairwise.
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // 0x1 moves element 1 down; add to get the full sum in dword 0.
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5052 
// Add-reduction of an 8-element int vector (256-bit) on AVX1/AVX2-only CPUs:
// dst = src1 + sum(src2[0..7]). vphaddd works within each 128-bit lane, so
// the two lanes are combined with an explicit extract + add at the end.
instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(VM_Version::supports_avxonly());
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
            "vphaddd  $tmp,$tmp,$tmp2\n\t"
            "vextracti128_high  $tmp2,$tmp\n\t"
            "vpaddd   $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
            "movd     $dst,$tmp2\t! add reduction8I" %}
  ins_encode %{
    int vector_len = 1;
    // Per-lane horizontal adds: after these, dword 0 of each 128-bit lane
    // holds that lane's 4-element sum.
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    // NOTE(review): $tmp2 is read here before it is ever written; this looks
    // intentional because only dword 0 of each lane is consumed below, and
    // those dwords come from $tmp -- confirm against the original change.
    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // Combine the two 128-bit lanes (128-bit vpaddd, vector_len 0).
    __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    // Fold in the scalar operand src1 and move dword 0 to the GPR result.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5076 
// Add-reduction of an 8-element int vector (256-bit) on AVX-512 (UseAVX > 2):
// dst = src1 + sum(src2[0..7]). Halve the vector with extract/shuffle + add
// until the sum is in dword 0, then fold in the scalar operand.
instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpaddd  $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction8I" %}
  ins_encode %{
    int vector_len = 0;
    // Add the high 128-bit lane onto the low lane: 8 elements -> 4.
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    // 4 elements -> 2 (0xE brings the upper qword down), then 2 -> 1 (0x1).
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5104 
5105 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5106   predicate(UseAVX > 2);
5107   match(Set dst (AddReductionVI src1 src2));
5108   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5109   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5110             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5111             "vextracti128_high  $tmp,$tmp3\n\t"
5112             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5113             "pshufd  $tmp2,$tmp,0xE\n\t"
5114             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5115             "pshufd  $tmp2,$tmp,0x1\n\t"
5116             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5117             "movd    $tmp2,$src1\n\t"
5118             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5119             "movd    $dst,$tmp2\t! mul reduction16I" %}
5120   ins_encode %{
5121     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5122     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5123     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5124     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5125     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5126     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5127     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5128     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5129     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5130     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5131     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5132   %}
5133   ins_pipe( pipe_slow );
5134 %}
5135 
5136 #ifdef _LP64
// Add-reduction of a 2-element long vector on AVX-512 (UseAVX > 2):
// dst = src1 + src2[0] + src2[1]. (LP64 only: uses 64-bit movdq.)
instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "vpaddq  $tmp,$src2,$tmp2\n\t"
            "movdq   $tmp2,$src1\n\t"
            "vpaddq  $tmp2,$tmp,$tmp2\n\t"
            "movdq   $dst,$tmp2\t! add reduction2L" %}
  ins_encode %{
    // 0xE moves the upper qword (element 1) down; add the two elements.
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
    // Fold in the scalar operand src1 and move qword 0 to the GPR result.
    __ movdq($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5155 
// Add-reduction of a 4-element long vector (256-bit) on AVX-512 (UseAVX > 2):
// dst = src1 + sum(src2[0..3]). Halves the vector (128 -> 64 bits) with
// extract/shuffle + add before folding in the scalar operand.
instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpaddq  $tmp2,$tmp,$src2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $tmp,$src1\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! add reduction4L" %}
  ins_encode %{
    // Add the high 128-bit lane onto the low lane: 4 elements -> 2.
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
    // 0xE moves the upper qword down; add to get the full sum in qword 0.
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5178 
// Add-reduction of an 8-element long vector (512-bit) on AVX-512 (UseAVX > 2):
// dst = src1 + sum(src2[0..7]). Successively halves the vector
// (256 -> 128 -> 64 bits) with extract/shuffle + add.
instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpaddq  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $tmp,$src1\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! add reduction8L" %}
  ins_encode %{
    // Add the high 256 bits onto the low 256 bits: 8 elements -> 4.
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    // Add the high 128-bit lane onto the low lane: 4 elements -> 2.
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    // 0xE moves the upper qword down; add to get the full sum in qword 0.
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5205 #endif
5206 
// Add-reduction of a 2-element float vector on plain SSE (no AVX):
// dst = (dst + src2[0]) + src2[1]. The scalar accumulator is dst itself.
// FP adds are performed strictly left-to-right to keep Java FP semantics.
instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "addss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "addss   $dst,$tmp\t! add reduction2F" %}
  ins_encode %{
    // Accumulate element 0, then shuffle element 1 down and accumulate it.
    __ addss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5221 
// Add-reduction of a 2-element float vector on AVX (UseAVX > 0):
// dst = (dst + src2[0]) + src2[1], AVX three-operand form of the SSE rule.
instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
  ins_encode %{
    // Accumulate element 0, then shuffle element 1 down and accumulate it.
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5236 
// Add-reduction of a 4-element float vector on plain SSE (no AVX):
// dst = ((dst + src2[0]) + src2[1]) + ... strictly in element order;
// each pshufd brings the next element down to slot 0 for a scalar addss.
instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "addss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "addss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "addss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "addss   $dst,$tmp\t! add reduction4F" %}
  ins_encode %{
    // Element 0 is already in slot 0; elements 1..3 are shuffled down in turn.
    __ addss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5259 
5260 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5261   predicate(UseAVX > 0);
5262   match(Set dst (AddReductionVF dst src2));
5263   effect(TEMP tmp, TEMP dst);
5264   format %{ "vaddss  $dst,dst,$src2\n\t"
5265             "pshufd  $tmp,$src2,0x01\n\t"
5266             "vaddss  $dst,$dst,$tmp\n\t"
5267             "pshufd  $tmp,$src2,0x02\n\t"
5268             "vaddss  $dst,$dst,$tmp\n\t"
5269             "pshufd  $tmp,$src2,0x03\n\t"
5270             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5271   ins_encode %{
5272     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5273     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5274     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5275     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5276     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5277     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5278     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5279   %}
5280   ins_pipe( pipe_slow );
5281 %}
5282 
// Add-reduction of an 8-element float vector (256-bit) on AVX (UseAVX > 0):
// accumulates elements 0..3 of the low 128-bit lane into dst in order,
// then extracts the high lane and accumulates its four elements the same way.
// Strict element order preserves Java FP add semantics.
instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
  ins_encode %{
    // Accumulate elements 0..3 of the low 128-bit lane in order.
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Extract the high 128-bit lane and accumulate its elements 4..7.
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5321 
// Add-reduction of a 16-element float vector (512-bit) on AVX-512 (UseAVX > 2):
// processes one 128-bit quarter at a time -- accumulate its four elements into
// dst in order, then vextractf32x4 the next quarter -- so all 16 elements are
// added strictly in element order (Java FP semantics).
instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
  ins_encode %{
    // Quarter 0 (elements 0..3): already in the low 128 bits of src2.
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 1 (elements 4..7).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 2 (elements 8..11).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 3 (elements 12..15).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5392 
// Add-reduction of a 2-element double vector on plain SSE (no AVX):
// dst = (dst + src2[0]) + src2[1]. 0xE brings the upper qword down.
instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "addsd   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "addsd   $dst,$tmp\t! add reduction2D" %}
  ins_encode %{
    // Accumulate element 0, then shuffle element 1 down and accumulate it.
    __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5407 
// Add-reduction of a 2-element double vector on AVX (UseAVX > 0):
// dst = (dst + src2[0]) + src2[1], AVX three-operand form of the SSE rule.
instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vaddsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
  ins_encode %{
    // Accumulate element 0, then shuffle element 1 down and accumulate it.
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5422 
// Add-reduction of a 4-element double vector (256-bit) on AVX (UseAVX > 0):
// accumulates the two low-lane elements into dst in order, then extracts the
// high 128-bit lane and accumulates its two elements the same way.
instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf128  $tmp2,$src2,0x1\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
  ins_encode %{
    // Elements 0 and 1 from the low 128-bit lane.
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Elements 2 and 3 from the high 128-bit lane.
    __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5445 
// Add-reduction of an 8-element double vector (512-bit) on AVX-512
// (UseAVX > 2): processes one 128-bit quarter (two doubles) at a time so all
// eight elements are added strictly in element order (Java FP semantics).
instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
  ins_encode %{
    // Quarter 0 (elements 0,1): already in the low 128 bits of src2.
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 1 (elements 2,3).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 2 (elements 4,5).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 3 (elements 6,7).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5484 
// Multiply-reduction of a 2-element int vector on plain SSE4.1
// (UseSSE > 3, no AVX): dst = src1 * src2[0] * src2[1].
instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
            "pmulld  $tmp2,$src2\n\t"
            "movd    $tmp,$src1\n\t"
            "pmulld  $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! mul reduction2I" %}
  ins_encode %{
    // Bring element 1 down to slot 0 and multiply the two elements.
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
    // Fold in the scalar operand src1 and move dword 0 to the GPR result.
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5503 
// Multiply-reduction of a 2-element int vector on AVX (UseAVX > 0):
// dst = src1 * src2[0] * src2[1].
instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp2,$src2,0x1\n\t"
            "vpmulld  $tmp,$src2,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! mul reduction2I" %}
  ins_encode %{
    int vector_len = 0;
    // Bring element 1 down to slot 0 and multiply the two elements.
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // Fold in the scalar operand src1 and move dword 0 to the GPR result.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5523 
// Multiply-reduction of a 4-element int vector on plain SSE4.1
// (UseSSE > 3, no AVX): dst = src1 * src2[0] * src2[1] * src2[2] * src2[3].
instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "pmulld  $tmp2,$src2\n\t"
            "pshufd  $tmp,$tmp2,0x1\n\t"
            "pmulld  $tmp2,$tmp\n\t"
            "movd    $tmp,$src1\n\t"
            "pmulld  $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! mul reduction4I" %}
  ins_encode %{
    // 0xE moves the upper qword (elements 2,3) down; multiply halves pairwise.
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
    // 0x1 moves element 1 down; multiply to get the full product in dword 0.
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5546 
// Multiply-reduction of a 4-element int vector on AVX (UseAVX > 0):
// dst = src1 * product(src2[0..3]).
instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp2,$src2,0xE\n\t"
            "vpmulld  $tmp,$src2,$tmp2\n\t"
            "pshufd   $tmp2,$tmp,0x1\n\t"
            "vpmulld  $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! mul reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    // 0xE moves the upper qword down; multiply halves pairwise (4 -> 2).
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // 0x1 moves element 1 down; multiply to finish the reduction (2 -> 1).
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5570 
// Multiply-reduction of an 8-element int vector (256-bit) on AVX2
// (UseAVX > 1): dst = src1 * product(src2[0..7]). Halves the vector with
// extract/shuffle + multiply until the product is in dword 0.
instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 1);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpmulld  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp2,$tmp,0xE\n\t"
            "vpmulld  $tmp,$tmp,$tmp2\n\t"
            "pshufd   $tmp2,$tmp,0x1\n\t"
            "vpmulld  $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! mul reduction8I" %}
  ins_encode %{
    int vector_len = 0;
    // Multiply the high 128-bit lane into the low lane: 8 elements -> 4.
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    // 4 elements -> 2 (0xE brings the upper qword down), then 2 -> 1 (0x1).
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5598 
5599 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5600   predicate(UseAVX > 2);
5601   match(Set dst (MulReductionVI src1 src2));
5602   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5603   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5604             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5605             "vextracti128_high  $tmp,$tmp3\n\t"
5606             "vpmulld  $tmp,$tmp,$src2\n\t"
5607             "pshufd   $tmp2,$tmp,0xE\n\t"
5608             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5609             "pshufd   $tmp2,$tmp,0x1\n\t"
5610             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5611             "movd     $tmp2,$src1\n\t"
5612             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5613             "movd     $dst,$tmp2\t! mul reduction16I" %}
5614   ins_encode %{
5615     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5616     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5617     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5618     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5619     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5620     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5621     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5622     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5623     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5624     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5625     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5626   %}
5627   ins_pipe( pipe_slow );
5628 %}
5629 
5630 #ifdef _LP64
// Multiply-reduction of a 2-element long vector: dst = src1 * src2[0] *
// src2[1]. Requires AVX512DQ for the vpmullq instruction. (LP64 only.)
instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
  match(Set dst (MulReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp2,$src2,0xE\n\t"
            "vpmullq  $tmp,$src2,$tmp2\n\t"
            "movdq    $tmp2,$src1\n\t"
            "vpmullq  $tmp2,$tmp,$tmp2\n\t"
            "movdq    $dst,$tmp2\t! mul reduction2L" %}
  ins_encode %{
    // 0xE moves the upper qword (element 1) down; multiply the two elements.
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
    // Fold in the scalar operand src1 and move qword 0 to the GPR result.
    __ movdq($tmp2$$XMMRegister, $src1$$Register);
    __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5649 
// Multiply-reduction of a 4-element long vector (256-bit):
// dst = src1 * product(src2[0..3]). Requires AVX512DQ for vpmullq.
instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
  match(Set dst (MulReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpmullq  $tmp2,$tmp,$src2\n\t"
            "pshufd   $tmp,$tmp2,0xE\n\t"
            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
            "movdq    $tmp,$src1\n\t"
            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
            "movdq    $dst,$tmp2\t! mul reduction4L" %}
  ins_encode %{
    // Multiply the high 128-bit lane into the low lane: 4 elements -> 2.
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
    // 0xE moves the upper qword down; multiply to finish the reduction.
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    // Fold in the scalar operand src1 and move the result to the GPR.
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5672 
// Mul reduction of 8 longs (512-bit src2): dst = src1 * product of all 8 lanes.
// Successively halves the vector: 512->256 (vector_len 1 multiply), 256->128,
// 128->64, then folds in the scalar accumulator.  Requires AVX-512DQ.
instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
  match(Set dst (MulReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpmullq  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
            "pshufd   $tmp,$tmp2,0xE\n\t"
            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
            "movdq    $tmp,$src1\n\t"
            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
            "movdq    $dst,$tmp2\t! mul reduction8L" %}
  ins_encode %{
    // 512 -> 256: multiply upper and lower 256-bit halves (vector_len = 1).
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    // 256 -> 128.
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    // 128 -> 64: shuffle the upper lane down and multiply.
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    // Fold in the scalar accumulator src1 and move the result to dst.
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5699 #endif
5700 
// Mul reduction of 2 floats, SSE-only form: dst *= src2[0] * src2[1].
// dst doubles as the accumulator (matched as both input and output).
instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "mulss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "mulss   $dst,$tmp\t! mul reduction2F" %}
  ins_encode %{
    __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
    // Bring element 1 into the scalar position and multiply it in.
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5715 
// Mul reduction of 2 floats, AVX form (three-operand vmulss avoids the
// destructive SSE encoding): dst *= src2[0] * src2[1].
instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vmulss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    // Bring element 1 into the scalar position and multiply it in.
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5730 
// Mul reduction of 4 floats, SSE-only form: dst *= product of src2 lanes 0..3.
// Each pshufd selects one lane (0x01/0x02/0x03) into the scalar position.
instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "mulss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "mulss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "mulss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "mulss   $dst,$tmp\t! mul reduction4F" %}
  ins_encode %{
    __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5753 
// Mul reduction of 4 floats, AVX form: dst *= product of src2 lanes 0..3.
// Same lane-walk as the SSE version but with non-destructive vmulss.
instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vmulss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
  ins_encode %{
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5776 
// Mul reduction of 8 floats (256-bit src2): multiplies dst by lanes 0..3 of the
// low 128 bits, then extracts the high 128 bits into tmp2 and multiplies in its
// four lanes the same way.
instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vmulss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
            "vmulss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
  ins_encode %{
    // Lanes 0..3 of the low 128 bits.
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Lanes 4..7: extract the high 128 bits and repeat.
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5815 
// Mul reduction of 16 floats (512-bit src2, AVX-512): multiplies dst by the
// four lanes of each 128-bit quarter in turn; quarters 1..3 are pulled down
// with vextractf32x4 imm 0x1/0x2/0x3 into tmp2 before their lane-walk.
instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (MulReductionVF dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vmulss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vmulss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vmulss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vmulss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vmulss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
  ins_encode %{
    // Quarter 0 (lanes 0..3): src2's low 128 bits are addressable directly.
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 1 (lanes 4..7).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 2 (lanes 8..11).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 3 (lanes 12..15).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5886 
// Mul reduction of 2 doubles, SSE-only form: dst *= src2[0] * src2[1].
// NOTE(review): predicate tests UseSSE >= 1 although mulsd/pshufd are SSE2
// instructions — presumably fine because 64-bit always has SSE2; confirm.
instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVD dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "mulsd   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "mulsd   $dst,$tmp\t! mul reduction2D" %}
  ins_encode %{
    __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
    // Shuffle the upper double (0xE selects dwords 2,3) into the low half.
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5901 
// Mul reduction of 2 doubles, AVX form: dst *= src2[0] * src2[1].
instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVD dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vmulsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
  ins_encode %{
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    // Shuffle the upper double (0xE selects dwords 2,3) into the low half.
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5916 
// Mul reduction of 4 doubles (256-bit src2): multiplies dst by both doubles of
// the low 128 bits, then extracts the high 128 bits and multiplies in both of
// its doubles.
instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVD dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vmulsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
            "vmulsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
  ins_encode %{
    // Low 128 bits: lanes 0 and 1.
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // High 128 bits: lanes 2 and 3.
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5939 
// Mul reduction of 8 doubles (512-bit src2, AVX-512): multiplies dst by both
// doubles of each 128-bit quarter; quarters 1..3 are pulled down with
// vextractf32x4 imm 0x1/0x2/0x3 into tmp2 before their two multiplies.
//
// Fix: the second "pshufd" line of the format string previously printed
// "$tmp,$src2,0xE" while the encoding emits pshufd($tmp, $tmp2, 0xE) — the
// debug listing misdescribed the instruction.  It now reads "$tmp,$tmp2,0xE",
// matching the encode body and the parallel 0x2/0x3 segments below.
instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (MulReductionVD dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vmulsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vmulsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vmulsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vmulsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
  ins_encode %{
    // Quarter 0 (lanes 0,1): src2's low 128 bits are addressable directly.
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 1 (lanes 2,3).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 2 (lanes 4,5).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    // Quarter 3 (lanes 6,7).
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5978 
5979 // ====================VECTOR ARITHMETIC=======================================
5980 
5981 // --------------------------------- ADD --------------------------------------
5982 
5983 // Bytes vector add
// Add packed 4 bytes, SSE in-place form: dst = dst + src.
instruct vadd4B(vecS dst, vecS src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVB dst src));
  format %{ "paddb   $dst,$src\t! add packed4B" %}
  ins_encode %{
    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
5993 
// Add packed 4 bytes, AVX three-operand form: dst = src1 + src2
// (vector_len 0 = 128-bit encoding).
instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVB src1 src2));
  format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6004 
6005 
// Add packed 4 bytes with a memory operand: dst = src + [mem].
instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVB src (LoadVector mem)));
  format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6016 
// Add packed 8 bytes, SSE in-place form: dst = dst + src.
instruct vadd8B(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVB dst src));
  format %{ "paddb   $dst,$src\t! add packed8B" %}
  ins_encode %{
    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6026 
// Add packed 8 bytes, AVX three-operand form: dst = src1 + src2.
instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVB src1 src2));
  format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6037 
6038 
// Add packed 8 bytes with a memory operand: dst = src + [mem].
instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVB src (LoadVector mem)));
  format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6049 
// Add packed 16 bytes, SSE in-place form: dst = dst + src.
instruct vadd16B(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
  match(Set dst (AddVB dst src));
  format %{ "paddb   $dst,$src\t! add packed16B" %}
  ins_encode %{
    __ paddb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6059 
// Add packed 16 bytes, AVX three-operand form: dst = src1 + src2.
instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
  match(Set dst (AddVB src1 src2));
  format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6070 
// Add packed 16 bytes with a memory operand: dst = src + [mem].
instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
  match(Set dst (AddVB src (LoadVector mem)));
  format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6081 
// Add packed 32 bytes (256-bit, vector_len 1), AVX2: dst = src1 + src2.
instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  match(Set dst (AddVB src1 src2));
  format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6092 
// Add packed 32 bytes (256-bit) with a memory operand: dst = src + [mem].
instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  match(Set dst (AddVB src (LoadVector mem)));
  format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6103 
// Add packed 64 bytes (512-bit, vector_len 2): dst = src1 + src2.
// Byte ops on 512-bit vectors need AVX-512BW.
instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
  match(Set dst (AddVB src1 src2));
  format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
  ins_encode %{
    int vector_len = 2;
    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6114 
// Add packed 64 bytes (512-bit) with a memory operand; needs AVX-512BW.
instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
  match(Set dst (AddVB src (LoadVector mem)));
  format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
  ins_encode %{
    int vector_len = 2;
    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6125 
6126 // Shorts/Chars vector add
// Add packed 2 shorts/chars, SSE in-place form: dst = dst + src.
instruct vadd2S(vecS dst, vecS src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVS dst src));
  format %{ "paddw   $dst,$src\t! add packed2S" %}
  ins_encode %{
    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6136 
// Add packed 2 shorts/chars, AVX three-operand form: dst = src1 + src2.
instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
  match(Set dst (AddVS src1 src2));
  format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6147 
// Add packed 2 shorts/chars with a memory operand: dst = src + [mem].
instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVS src (LoadVector mem)));
  format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6158 
// Add packed 4 shorts/chars, SSE in-place form: dst = dst + src.
instruct vadd4S(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVS dst src));
  format %{ "paddw   $dst,$src\t! add packed4S" %}
  ins_encode %{
    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6168 
// Add packed 4 shorts/chars, AVX three-operand form: dst = src1 + src2.
instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVS src1 src2));
  format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6179 
// Add packed 4 shorts/chars with a memory operand: dst = src + [mem].
instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVS src (LoadVector mem)));
  format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6190 
// Add packed 8 shorts/chars, SSE in-place form: dst = dst + src.
instruct vadd8S(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVS dst src));
  format %{ "paddw   $dst,$src\t! add packed8S" %}
  ins_encode %{
    __ paddw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6200 
// Add packed 8 shorts/chars, AVX three-operand form: dst = src1 + src2.
instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVS src1 src2));
  format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6211 
// Add packed 8 shorts/chars with a memory operand: dst = src + [mem].
instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVS src (LoadVector mem)));
  format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6222 
// Add packed 16 shorts/chars (256-bit, vector_len 1), AVX2: dst = src1 + src2.
instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (AddVS src1 src2));
  format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6233 
// Add packed 16 shorts/chars (256-bit) with a memory operand: dst = src + [mem].
instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (AddVS src (LoadVector mem)));
  format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6244 
// Add packed 32 shorts/chars (512-bit, vector_len 2): dst = src1 + src2.
// Word ops on 512-bit vectors need AVX-512BW.
instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (AddVS src1 src2));
  format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6255 
// Add packed 32 shorts/chars (512-bit) with a memory operand; needs AVX-512BW.
instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (AddVS src (LoadVector mem)));
  format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6266 
6267 // Integers vector add
// Add packed 2 ints, SSE in-place form: dst = dst + src.
instruct vadd2I(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVI dst src));
  format %{ "paddd   $dst,$src\t! add packed2I" %}
  ins_encode %{
    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6277 
// Add packed 2 ints, AVX three-operand form: dst = src1 + src2.
instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVI src1 src2));
  format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6288 
// Add packed 2 ints with a memory operand: dst = src + [mem].
instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVI src (LoadVector mem)));
  format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6299 
// Add packed 4 ints, SSE in-place form: dst = dst + src.
instruct vadd4I(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVI dst src));
  format %{ "paddd   $dst,$src\t! add packed4I" %}
  ins_encode %{
    __ paddd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6309 
// Add packed 4 ints, AVX three-operand form: dst = src1 + src2.
instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVI src1 src2));
  format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6320 
// Add packed 4 ints with a memory operand: dst = src + [mem].
instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVI src (LoadVector mem)));
  format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6331 
// Add packed 8 ints (256-bit, vector_len 1), AVX2: dst = src1 + src2.
instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (AddVI src1 src2));
  format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6342 
// Add packed 8 ints (256-bit) with a memory operand: dst = src + [mem].
instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (AddVI src (LoadVector mem)));
  format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6353 
// Add packed 16 ints (512-bit, vector_len 2), AVX-512F: dst = src1 + src2.
instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AddVI src1 src2));
  format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6364 
// Add packed 16 ints (512-bit) with a memory operand: dst = src + [mem].
instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AddVI src (LoadVector mem)));
  format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6375 
6376 // Longs vector add
// Add packed 2 longs, SSE in-place form: dst = dst + src.
instruct vadd2L(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVL dst src));
  format %{ "paddq   $dst,$src\t! add packed2L" %}
  ins_encode %{
    __ paddq($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
6386 
// Add packed 2 longs, AVX three-operand form: dst = src1 + src2.
instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVL src1 src2));
  format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6397 
6398 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6399   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6400   match(Set dst (AddVL src (LoadVector mem)));
6401   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6402   ins_encode %{
6403     int vector_len = 0;
6404     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6405   %}
6406   ins_pipe( pipe_slow );
6407 %}
6408 
6409 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6410   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6411   match(Set dst (AddVL src1 src2));
6412   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6413   ins_encode %{
6414     int vector_len = 1;
6415     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6416   %}
6417   ins_pipe( pipe_slow );
6418 %}
6419 
6420 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6421   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6422   match(Set dst (AddVL src (LoadVector mem)));
6423   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6424   ins_encode %{
6425     int vector_len = 1;
6426     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6427   %}
6428   ins_pipe( pipe_slow );
6429 %}
6430 
6431 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6432   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6433   match(Set dst (AddVL src1 src2));
6434   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6435   ins_encode %{
6436     int vector_len = 2;
6437     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6438   %}
6439   ins_pipe( pipe_slow );
6440 %}
6441 
6442 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6443   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6444   match(Set dst (AddVL src (LoadVector mem)));
6445   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6446   ins_encode %{
6447     int vector_len = 2;
6448     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6449   %}
6450   ins_pipe( pipe_slow );
6451 %}
6452 
// Floats vector add (addps / vaddps).
// SSE rules (UseAVX == 0) use the destructive two-operand form; AVX rules
// use the non-destructive three-operand form.  Note: unlike the integer
// ops above, 256-bit float add only needs UseAVX > 0 (AVX1 covers
// floating-point YMM ops).  vector_len: 0 = 128-bit, 1 = 256-bit,
// 2 = 512-bit.

// Add packed 2 floats, SSE two-operand form (dst += src).
instruct vadd2F(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVF dst src));
  format %{ "addps   $dst,$src\t! add packed2F" %}
  ins_encode %{
    __ addps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 2 floats, AVX three-operand register form (128-bit).
instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVF src1 src2));
  format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 2 floats; second operand taken directly from memory (128-bit).
instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVF src (LoadVector mem)));
  format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 4 floats, SSE two-operand form (dst += src).
instruct vadd4F(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVF dst src));
  format %{ "addps   $dst,$src\t! add packed4F" %}
  ins_encode %{
    __ addps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 4 floats, AVX three-operand register form (128-bit).
instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVF src1 src2));
  format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 4 floats; second operand taken directly from memory (128-bit).
instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVF src (LoadVector mem)));
  format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 8 floats, register-register (256-bit, AVX1 suffices for FP).
instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVF src1 src2));
  format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 8 floats; second operand taken directly from memory (256-bit).
instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AddVF src (LoadVector mem)));
  format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 16 floats, register-register (512-bit, AVX-512).
instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AddVF src1 src2));
  format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 16 floats; second operand taken directly from memory (512-bit).
instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AddVF src (LoadVector mem)));
  format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6561 
// Doubles vector add (addpd / vaddpd).
// Same structure as the float rules: SSE destructive two-operand form for
// UseAVX == 0, AVX three-operand forms otherwise (AVX1 suffices for
// 256-bit FP).  vector_len: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.

// Add packed 2 doubles, SSE two-operand form (dst += src).
instruct vadd2D(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVD dst src));
  format %{ "addpd   $dst,$src\t! add packed2D" %}
  ins_encode %{
    __ addpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 2 doubles, AVX three-operand register form (128-bit).
instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVD src1 src2));
  format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 2 doubles; second operand taken directly from memory (128-bit).
instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AddVD src (LoadVector mem)));
  format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 4 doubles, register-register (256-bit).
instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVD src1 src2));
  format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 4 doubles; second operand taken directly from memory (256-bit).
instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AddVD src (LoadVector mem)));
  format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 8 doubles, register-register (512-bit, AVX-512).
instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AddVD src1 src2));
  format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Add packed 8 doubles; second operand taken directly from memory (512-bit).
instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AddVD src (LoadVector mem)));
  format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6638 
6639 // --------------------------------- SUB --------------------------------------
6640 
6641 // Bytes vector sub
6642 instruct vsub4B(vecS dst, vecS src) %{
6643   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6644   match(Set dst (SubVB dst src));
6645   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6646   ins_encode %{
6647     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6648   %}
6649   ins_pipe( pipe_slow );
6650 %}
6651 
6652 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6653   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6654   match(Set dst (SubVB src1 src2));
6655   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6656   ins_encode %{
6657     int vector_len = 0;
6658     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6659   %}
6660   ins_pipe( pipe_slow );
6661 %}
6662 
6663 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6664   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6665   match(Set dst (SubVB src (LoadVector mem)));
6666   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6667   ins_encode %{
6668     int vector_len = 0;
6669     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6670   %}
6671   ins_pipe( pipe_slow );
6672 %}
6673 
6674 instruct vsub8B(vecD dst, vecD src) %{
6675   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6676   match(Set dst (SubVB dst src));
6677   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6678   ins_encode %{
6679     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6680   %}
6681   ins_pipe( pipe_slow );
6682 %}
6683 
6684 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6685   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6686   match(Set dst (SubVB src1 src2));
6687   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6688   ins_encode %{
6689     int vector_len = 0;
6690     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6691   %}
6692   ins_pipe( pipe_slow );
6693 %}
6694 
6695 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6696   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6697   match(Set dst (SubVB src (LoadVector mem)));
6698   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6699   ins_encode %{
6700     int vector_len = 0;
6701     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6702   %}
6703   ins_pipe( pipe_slow );
6704 %}
6705 
6706 instruct vsub16B(vecX dst, vecX src) %{
6707   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6708   match(Set dst (SubVB dst src));
6709   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6710   ins_encode %{
6711     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6712   %}
6713   ins_pipe( pipe_slow );
6714 %}
6715 
6716 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6717   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6718   match(Set dst (SubVB src1 src2));
6719   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6720   ins_encode %{
6721     int vector_len = 0;
6722     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6723   %}
6724   ins_pipe( pipe_slow );
6725 %}
6726 
6727 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6728   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6729   match(Set dst (SubVB src (LoadVector mem)));
6730   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6731   ins_encode %{
6732     int vector_len = 0;
6733     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6734   %}
6735   ins_pipe( pipe_slow );
6736 %}
6737 
6738 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6739   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6740   match(Set dst (SubVB src1 src2));
6741   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6742   ins_encode %{
6743     int vector_len = 1;
6744     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6745   %}
6746   ins_pipe( pipe_slow );
6747 %}
6748 
6749 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6750   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6751   match(Set dst (SubVB src (LoadVector mem)));
6752   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6753   ins_encode %{
6754     int vector_len = 1;
6755     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6756   %}
6757   ins_pipe( pipe_slow );
6758 %}
6759 
6760 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6761   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6762   match(Set dst (SubVB src1 src2));
6763   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6764   ins_encode %{
6765     int vector_len = 2;
6766     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6767   %}
6768   ins_pipe( pipe_slow );
6769 %}
6770 
6771 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6772   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6773   match(Set dst (SubVB src (LoadVector mem)));
6774   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6775   ins_encode %{
6776     int vector_len = 2;
6777     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6778   %}
6779   ins_pipe( pipe_slow );
6780 %}
6781 
// Shorts/Chars vector sub (psubw / vpsubw).
// Same shape as the byte rules: SSE destructive form for UseAVX == 0,
// AVX three-operand forms otherwise; 512-bit word subtract requires
// AVX-512BW.  vector_len: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.

// Subtract packed 2 shorts, SSE two-operand form (dst -= src).
instruct vsub2S(vecS dst, vecS src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVS dst src));
  format %{ "psubw   $dst,$src\t! sub packed2S" %}
  ins_encode %{
    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 shorts, AVX three-operand register form (128-bit).
instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVS src1 src2));
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 shorts; subtrahend taken directly from memory (128-bit).
instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVS src (LoadVector mem)));
  format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 shorts, SSE two-operand form (dst -= src).
instruct vsub4S(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVS dst src));
  format %{ "psubw   $dst,$src\t! sub packed4S" %}
  ins_encode %{
    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 shorts, AVX three-operand register form (128-bit).
instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVS src1 src2));
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 shorts; subtrahend taken directly from memory (128-bit).
instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVS src (LoadVector mem)));
  format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 shorts, SSE two-operand form (dst -= src).
instruct vsub8S(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (SubVS dst src));
  format %{ "psubw   $dst,$src\t! sub packed8S" %}
  ins_encode %{
    __ psubw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 shorts, AVX three-operand register form (128-bit).
instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (SubVS src1 src2));
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 shorts; subtrahend taken directly from memory (128-bit).
instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (SubVS src (LoadVector mem)));
  format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 16 shorts, register-register (256-bit, needs AVX2).
instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (SubVS src1 src2));
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 16 shorts; subtrahend taken directly from memory (256-bit).
instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (SubVS src (LoadVector mem)));
  format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 32 shorts, register-register (512-bit, needs AVX-512BW).
instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (SubVS src1 src2));
  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 32 shorts; subtrahend from memory (512-bit, needs AVX-512BW).
instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (SubVS src (LoadVector mem)));
  format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
6922 
// Integers vector sub (psubd / vpsubd).
// SSE destructive form for UseAVX == 0; AVX three-operand forms otherwise.
// YMM integer ops need UseAVX > 1 (AVX2); ZMM needs UseAVX > 2 (AVX-512F,
// dword ops do not need the BW extension).  vector_len: 0 = 128-bit,
// 1 = 256-bit, 2 = 512-bit.

// Subtract packed 2 ints, SSE two-operand form (dst -= src).
instruct vsub2I(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVI dst src));
  format %{ "psubd   $dst,$src\t! sub packed2I" %}
  ins_encode %{
    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 ints, AVX three-operand register form (128-bit).
instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVI src1 src2));
  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 ints; subtrahend taken directly from memory (128-bit).
instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVI src (LoadVector mem)));
  format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 ints, SSE two-operand form (dst -= src).
instruct vsub4I(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVI dst src));
  format %{ "psubd   $dst,$src\t! sub packed4I" %}
  ins_encode %{
    __ psubd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 ints, AVX three-operand register form (128-bit).
instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVI src1 src2));
  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 ints; subtrahend taken directly from memory (128-bit).
instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVI src (LoadVector mem)));
  format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 ints, register-register (256-bit, needs AVX2).
instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (SubVI src1 src2));
  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 ints; subtrahend taken directly from memory (256-bit).
instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (SubVI src (LoadVector mem)));
  format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 16 ints, register-register (512-bit, AVX-512).
instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (SubVI src1 src2));
  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 16 ints; subtrahend taken directly from memory (512-bit).
instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (SubVI src (LoadVector mem)));
  format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7031 
// Longs vector sub (psubq / vpsubq).
// SSE destructive form for UseAVX == 0; AVX three-operand forms otherwise.
// vector_len: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.

// Subtract packed 2 longs, SSE two-operand form (dst -= src).
instruct vsub2L(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVL dst src));
  format %{ "psubq   $dst,$src\t! sub packed2L" %}
  ins_encode %{
    __ psubq($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 longs, AVX three-operand register form (128-bit).
instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVL src1 src2));
  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 longs; subtrahend taken directly from memory (128-bit).
instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVL src (LoadVector mem)));
  format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 longs, register-register (256-bit, needs AVX2).
instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (SubVL src1 src2));
  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 longs; subtrahend taken directly from memory (256-bit).
instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (SubVL src (LoadVector mem)));
  format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 longs, register-register (512-bit, AVX-512).
instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (SubVL src1 src2));
  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 longs; subtrahend taken directly from memory (512-bit).
instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (SubVL src (LoadVector mem)));
  format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7108 
// Floats vector sub (subps / vsubps).
// SSE destructive form for UseAVX == 0; AVX three-operand forms otherwise.
// As with float add, 256-bit FP only needs UseAVX > 0 (AVX1); 512-bit
// needs UseAVX > 2.  vector_len: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.

// Subtract packed 2 floats, SSE two-operand form (dst -= src).
instruct vsub2F(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVF dst src));
  format %{ "subps   $dst,$src\t! sub packed2F" %}
  ins_encode %{
    __ subps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 floats, AVX three-operand register form (128-bit).
instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVF src1 src2));
  format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 floats; subtrahend taken directly from memory (128-bit).
instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVF src (LoadVector mem)));
  format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 floats, SSE two-operand form (dst -= src).
instruct vsub4F(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVF dst src));
  format %{ "subps   $dst,$src\t! sub packed4F" %}
  ins_encode %{
    __ subps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 floats, AVX three-operand register form (128-bit).
instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVF src1 src2));
  format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 floats; subtrahend taken directly from memory (128-bit).
instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVF src (LoadVector mem)));
  format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 floats, register-register (256-bit, AVX1 suffices).
instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (SubVF src1 src2));
  format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 floats; subtrahend taken directly from memory (256-bit).
instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (SubVF src (LoadVector mem)));
  format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 16 floats, register-register (512-bit, AVX-512).
instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (SubVF src1 src2));
  format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 16 floats; subtrahend taken directly from memory (512-bit).
instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (SubVF src (LoadVector mem)));
  format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7217 
// Doubles vector sub (subpd / vsubpd).
// SSE destructive form for UseAVX == 0; AVX three-operand forms otherwise
// (AVX1 suffices for 256-bit FP).  vector_len: 0 = 128-bit, 1 = 256-bit,
// 2 = 512-bit.

// Subtract packed 2 doubles, SSE two-operand form (dst -= src).
instruct vsub2D(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVD dst src));
  format %{ "subpd   $dst,$src\t! sub packed2D" %}
  ins_encode %{
    __ subpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 doubles, AVX three-operand register form (128-bit).
instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVD src1 src2));
  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 2 doubles; subtrahend taken directly from memory (128-bit).
instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SubVD src (LoadVector mem)));
  format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
  ins_encode %{
    int vector_len = 0; // 128-bit (XMM) encoding
    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 doubles, register-register (256-bit).
instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVD src1 src2));
  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 4 doubles; subtrahend taken directly from memory (256-bit).
instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SubVD src (LoadVector mem)));
  format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
  ins_encode %{
    int vector_len = 1; // 256-bit (YMM) encoding
    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 doubles, register-register (512-bit, AVX-512).
instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (SubVD src1 src2));
  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Subtract packed 8 doubles; subtrahend taken directly from memory (512-bit).
instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (SubVD src (LoadVector mem)));
  format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
  ins_encode %{
    int vector_len = 2; // 512-bit (ZMM) encoding
    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7294 
7295 // --------------------------------- MUL --------------------------------------
7296 
// Shorts/Chars vector mul
// SSE two-operand (destructive) form, 2 shorts: dst = dst * src (low 16 bits).
instruct vmul2S(vecS dst, vecS src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVS dst src));
  format %{ "pmullw $dst,$src\t! mul packed2S" %}
  ins_encode %{
    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 2 shorts in a 128-bit encoding.
instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVS src1 src2));
  format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant: dst = src * [mem].
instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVS src (LoadVector mem)));
  format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// SSE destructive form, 4 shorts (vecD).
instruct vmul4S(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVS dst src));
  format %{ "pmullw  $dst,$src\t! mul packed4S" %}
  ins_encode %{
    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 4 shorts.
instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVS src1 src2));
  format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 4 shorts.
instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVS src (LoadVector mem)));
  format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// SSE destructive form, 8 shorts (vecX).
instruct vmul8S(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (MulVS dst src));
  format %{ "pmullw  $dst,$src\t! mul packed8S" %}
  ins_encode %{
    __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 8 shorts.
instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (MulVS src1 src2));
  format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 8 shorts.
instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (MulVS src (LoadVector mem)));
  format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit (vecY) form, 16 shorts; requires UseAVX > 1 per the predicate
// (256-bit integer ops need AVX2).
instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (MulVS src1 src2));
  format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 16 shorts.
instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (MulVS src (LoadVector mem)));
  format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit (vecZ) form, 32 shorts; word-size 512-bit ops additionally
// require AVX512BW per the predicate.
instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (MulVS src1 src2));
  format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit memory-operand variant, 32 shorts.
instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (MulVS src (LoadVector mem)));
  format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7437 
// Integers vector mul (sse4_1)
// SSE destructive form, 2 ints; pmulld needs SSE4.1 (UseSSE > 3).
instruct vmul2I(vecD dst, vecD src) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
  match(Set dst (MulVI dst src));
  format %{ "pmulld  $dst,$src\t! mul packed2I" %}
  ins_encode %{
    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 2 ints.
instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVI src1 src2));
  format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 2 ints: dst = src * [mem].
instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVI src (LoadVector mem)));
  format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// SSE destructive form, 4 ints (vecX); needs SSE4.1.
instruct vmul4I(vecX dst, vecX src) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
  match(Set dst (MulVI dst src));
  format %{ "pmulld  $dst,$src\t! mul packed4I" %}
  ins_encode %{
    __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 4 ints.
instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVI src1 src2));
  format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 4 ints.
instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVI src (LoadVector mem)));
  format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7502 
// Longs vector mul: vpmullq requires AVX512DQ per all predicates below.
// 128-bit (vecX) reg-reg form, 2 longs.
instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
  match(Set dst (MulVL src1 src2));
  format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 128-bit memory-operand variant, 2 longs: dst = src * [mem].
instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
  match(Set dst (MulVL src (LoadVector mem)));
  format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit (vecY) reg-reg form, 4 longs.
instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
  match(Set dst (MulVL src1 src2));
  format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 4 longs.
instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
  match(Set dst (MulVL src (LoadVector mem)));
  format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit (vecZ) reg-reg form, 8 longs.
instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
  match(Set dst (MulVL src1 src2));
  format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit memory-operand variant, 8 longs.
instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
  match(Set dst (MulVL src (LoadVector mem)));
  format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7568 
// 256-bit (vecY) int mul, 8 ints; requires UseAVX > 1 per the predicate
// (256-bit integer ops need AVX2).
instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (MulVI src1 src2));
  format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 8 ints: dst = src * [mem].
instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (MulVI src (LoadVector mem)));
  format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit (vecZ) int mul, 16 ints; requires UseAVX > 2.
instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (MulVI src1 src2));
  format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit memory-operand variant, 16 ints.
instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (MulVI src (LoadVector mem)));
  format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7612 
// Floats vector mul
// SSE destructive form, 2 floats: dst = dst * src.
instruct vmul2F(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVF dst src));
  format %{ "mulps   $dst,$src\t! mul packed2F" %}
  ins_encode %{
    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 2 floats.
instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVF src1 src2));
  format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 2 floats: dst = src * [mem].
instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVF src (LoadVector mem)));
  format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// SSE destructive form, 4 floats (vecX).
instruct vmul4F(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVF dst src));
  format %{ "mulps   $dst,$src\t! mul packed4F" %}
  ins_encode %{
    __ mulps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 4 floats.
instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVF src1 src2));
  format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 4 floats.
instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVF src (LoadVector mem)));
  format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit (vecY) reg-reg form, 8 floats.
instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (MulVF src1 src2));
  format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 8 floats.
instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (MulVF src (LoadVector mem)));
  format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit (vecZ) reg-reg form, 16 floats; requires UseAVX > 2.
instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (MulVF src1 src2));
  format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit memory-operand variant, 16 floats.
instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (MulVF src (LoadVector mem)));
  format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7721 
// Doubles vector mul
// SSE destructive form, 2 doubles: dst = dst * src.
instruct vmul2D(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVD dst src));
  format %{ "mulpd   $dst,$src\t! mul packed2D" %}
  ins_encode %{
    __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 2 doubles.
instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 2 doubles: dst = src * [mem].
instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit (vecY) reg-reg form, 4 doubles.
instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 4 doubles.
instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7776 
// 512-bit (vecZ) reg-reg double mul, 8 doubles; requires UseAVX > 2.
// Fix: dropped the stray " k0" mask-register text from the format string —
// the encoding uses no mask register, and no sibling 512-bit rule
// (vsub8D_reg, vdiv8D_reg) prints one.
instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7787 
// 512-bit memory-operand double mul, 8 doubles: dst = src * [mem].
// Fix: dropped the stray " k0" mask-register text from the format string —
// the encoding uses no mask register, consistent with vsub8D_mem/vdiv8D_mem.
instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7798 
// Conditional move of 8 packed floats (256-bit legVecY):
// a vector compare of src1 vs src2 (condition taken from the copnd operand)
// writes a mask into dst, then blendvps merges src1/src2 under that mask.
// dst is TEMP because it is clobbered by the compare before the blend reads it.
// NOTE(review): which source the blend selects for set mask bits follows
// blendvps semantics — confirm polarity against the assembler if modifying.
instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
  effect(TEMP dst, USE src1, USE src2);
  format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
            "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
         %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    int cond = (Assembler::Condition)($copnd$$cmpcode);  // comparison predicate from copnd
    __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
    __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7814 
// Conditional move of 4 packed doubles (256-bit legVecY): same
// compare-then-blend shape as vcmov8F_reg above, using cmppd/blendvpd.
// dst is TEMP because the compare writes the mask into it first.
instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
  effect(TEMP dst, USE src1, USE src2);
  format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
            "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
         %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    int cond = (Assembler::Condition)($copnd$$cmpcode);  // comparison predicate from copnd
    __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
    __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7830 
7831 // --------------------------------- DIV --------------------------------------
7832 
// Floats vector div
// SSE destructive form, 2 floats: dst = dst / src.
instruct vdiv2F(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (DivVF dst src));
  format %{ "divps   $dst,$src\t! div packed2F" %}
  ins_encode %{
    __ divps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 2 floats: dst = src1 / src2.
instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (DivVF src1 src2));
  format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 2 floats: dst = src / [mem].
instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (DivVF src (LoadVector mem)));
  format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width
    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// SSE destructive form, 4 floats (vecX).
instruct vdiv4F(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (DivVF dst src));
  format %{ "divps   $dst,$src\t! div packed4F" %}
  ins_encode %{
    __ divps($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 4 floats.
instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (DivVF src1 src2));
  format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 4 floats.
instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (DivVF src (LoadVector mem)));
  format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit (vecY) reg-reg form, 8 floats.
instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (DivVF src1 src2));
  format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 8 floats.
instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (DivVF src (LoadVector mem)));
  format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7919 
// 512-bit (vecZ) reg-reg float div, 16 floats: dst = src1 / src2.
// Fix: predicate tightened from UseAVX > 0 to UseAVX > 2 for consistency
// with every other 512-bit (vecZ) rule in this file (vsub16F_reg,
// vmul16F_reg, vdiv8D_reg). A 16-float vector only exists when AVX-512 is
// available, so the looser test could never match more but was misleading.
instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (DivVF src1 src2));
  format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7930 
// 512-bit memory-operand float div, 16 floats: dst = src / [mem].
// Fix: predicate tightened from UseAVX > 0 to UseAVX > 2, matching
// vdiv16F_reg and the other 512-bit (vecZ) rules.
instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (DivVF src (LoadVector mem)));
  format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
7941 
// Doubles vector div
// SSE destructive form, 2 doubles: dst = dst / src.
instruct vdiv2D(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (DivVD dst src));
  format %{ "divpd   $dst,$src\t! div packed2D" %}
  ins_encode %{
    __ divpd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand form, 2 doubles: dst = src1 / src2.
instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (DivVD src1 src2));
  format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Memory-operand variant, 2 doubles: dst = src / [mem].
instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (DivVD src (LoadVector mem)));
  format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
  ins_encode %{
    int vector_len = 0;  // 128-bit encoding width (vecX)
    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit (vecY) reg-reg form, 4 doubles.
instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (DivVD src1 src2));
  format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit memory-operand variant, 4 doubles.
instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (DivVD src (LoadVector mem)));
  format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
  ins_encode %{
    int vector_len = 1;  // 256-bit encoding width (vecY)
    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit (vecZ) reg-reg form, 8 doubles; requires UseAVX > 2.
instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (DivVD src1 src2));
  format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit memory-operand variant, 8 doubles.
instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (DivVD src (LoadVector mem)));
  format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
  ins_encode %{
    int vector_len = 2;  // 512-bit encoding width (vecZ)
    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8018 
8019 // ------------------------------ Shift ---------------------------------------
8020 
// Left and right shift count vectors are the same on x86
// (only lowest bits of xmm reg are used for count).
// Moves a GP-register shift count into the low dword of an XMM register;
// one rule covers both LShiftCntV and RShiftCntV via the two match clauses.
instruct vshiftcnt(vecS dst, rRegI cnt) %{
  match(Set dst (LShiftCntV cnt));
  match(Set dst (RShiftCntV cnt));
  format %{ "movd    $dst,$cnt\t! load shift count" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $cnt$$Register);
  %}
  ins_pipe( pipe_slow );
%}
8032 
8033 // --------------------------------- Sqrt --------------------------------------
8034 
8035 // Floating point vector sqrt
// Packed-double square root (SqrtVD), AVX only (predicate UseAVX > 0/2).
// Each vector width has a register-source form and a memory-source form
// (the _mem variants fold a LoadVector into the vsqrtpd operand).
// vector_len encodes the VEX/EVEX length: 0 = 128-bit, 1 = 256-bit,
// 2 = 512-bit.

// 2 doubles in a 128-bit register (AVX1+).
instruct vsqrt2D_reg(vecX dst, vecX src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SqrtVD src));
  format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
  ins_encode %{
    int vector_len = 0;
    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt2D_mem(vecX dst, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SqrtVD (LoadVector mem)));
  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
  ins_encode %{
    int vector_len = 0;
    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 4 doubles in a 256-bit register (AVX1+).
instruct vsqrt4D_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SqrtVD src));
  format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
  ins_encode %{
    int vector_len = 1;
    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt4D_mem(vecY dst, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SqrtVD (LoadVector mem)));
  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
  ins_encode %{
    int vector_len = 1;
    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 8 doubles in a 512-bit register; requires AVX-512 (UseAVX > 2).
instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (SqrtVD src));
  format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
  ins_encode %{
    int vector_len = 2;
    __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt8D_mem(vecZ dst, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (SqrtVD (LoadVector mem)));
  format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
  ins_encode %{
    int vector_len = 2;
    __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8101 
// Packed-float square root (SqrtVF), AVX only — same layout as the
// packed-double family above: register and memory-source forms for each
// width, with vector_len 0/1/2 selecting 128/256/512-bit encodings.

// 2 floats in the low 64 bits of a 128-bit register (AVX1+).
instruct vsqrt2F_reg(vecD dst, vecD src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SqrtVF src));
  format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
  ins_encode %{
    int vector_len = 0;
    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt2F_mem(vecD dst, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (SqrtVF (LoadVector mem)));
  format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
  ins_encode %{
    int vector_len = 0;
    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 4 floats in a 128-bit register (AVX1+).
instruct vsqrt4F_reg(vecX dst, vecX src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SqrtVF src));
  format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
  ins_encode %{
    int vector_len = 0;
    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt4F_mem(vecX dst, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (SqrtVF (LoadVector mem)));
  format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
  ins_encode %{
    int vector_len = 0;
    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 8 floats in a 256-bit register (AVX1+).
instruct vsqrt8F_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (SqrtVF src));
  format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
  ins_encode %{
    int vector_len = 1;
    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt8F_mem(vecY dst, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (SqrtVF (LoadVector mem)));
  format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
  ins_encode %{
    int vector_len = 1;
    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 16 floats in a 512-bit register; requires AVX-512 (UseAVX > 2).
instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (SqrtVF src));
  format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
  ins_encode %{
    int vector_len = 2;
    __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt16F_mem(vecZ dst, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (SqrtVF (LoadVector mem)));
  format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
  ins_encode %{
    int vector_len = 2;
    __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8189 
8190 // ------------------------------ LeftShift -----------------------------------
8191 
8192 // Shorts/Chars vector left shift
// Short/char vector left shift (LShiftVS) family.  Each element count
// has up to four variants:
//   - SSE (UseAVX == 0), destructive two-operand form: psllw with either
//     an XMM shift-count register or an immediate count.
//   - AVX (UseAVX > 0/1/2), non-destructive three-operand form: vpsllw
//     with a register or immediate count, vector_len 0/1/2 selecting
//     128/256/512-bit encodings.
// The 512-bit forms additionally require AVX512BW since vpsllw operates
// on 16-bit lanes.

instruct vsll2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 256-bit forms require AVX2 (UseAVX > 1) for integer vpsllw.
instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit forms: word-sized EVEX shifts need AVX512BW.
instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8362 
8363 // Integers vector left shift
// Int vector left shift (LShiftVI) family — same structure as the short
// family above, but on 32-bit lanes via pslld/vpslld.  SSE forms are
// destructive (dst is also the first source); AVX forms are three-operand.
// 256-bit needs AVX2, 512-bit needs AVX-512 (dword shifts are in AVX512F,
// so no avx512bw check is required here).

instruct vsll2I(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8491 
8492 // Longs vector left shift
// Long vector left shift (LShiftVL) family on 64-bit lanes via
// psllq/vpsllq.  SSE forms (UseAVX == 0) only exist for 2L since two
// longs fill the 128-bit register; wider vectors require AVX2 (4L) or
// AVX-512 (8L) and use the non-destructive three-operand form.

instruct vsll2L(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL dst shift));
  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  ins_encode %{
    __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL dst shift));
  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  ins_encode %{
    __ psllq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8578 
8579 // ----------------------- LogicalRightShift -----------------------------------
8580 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data, because Java code converts a short value into an int
// with sign extension before the shift. But char vectors are fine, since
// chars are unsigned values.
8585 
// Short vector logical (unsigned) right shift (URShiftVS) family via
// psrlw/vpsrlw.  Mirrors the left-shift family: SSE destructive forms
// for 2S/4S/8S, AVX2 three-operand forms for 16S, and AVX512BW forms
// for 32S; register- and immediate-count variants for each.  See the
// comment above about short vs. char semantics.

instruct vsrl2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 512-bit word shifts require AVX512BW.
instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8755 
8756 // Integers vector logical right shift
// Int vector logical right shift (URShiftVI) family on 32-bit lanes via
// psrld/vpsrld.  SSE destructive forms for 2I/4I; AVX2 for 8I; AVX-512
// for 16I.  vector_len: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.

instruct vsrl2I(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8884 
8885 // Longs vector logical right shift
// Long vector logical right shift (URShiftVL) on 64-bit lanes via
// psrlq/vpsrlq.  SSE destructive forms exist only for 2L; 4L requires
// AVX2 with the three-operand form.

instruct vsrl2L(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
8949 
8950 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
8951   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8952   match(Set dst (URShiftVL src shift));
8953   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8954   ins_encode %{
8955     int vector_len = 2;
8956     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8957   %}
8958   ins_pipe( pipe_slow );
8959 %}
8960 
8961 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8962   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8963   match(Set dst (URShiftVL src shift));
8964   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8965   ins_encode %{
8966     int vector_len = 2;
8967     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8968   %}
8969   ins_pipe( pipe_slow );
8970 %}
8971 
// ------------------- ArithmeticRightShift -----------------------------------

// Shorts/Chars vector arithmetic right shift (sign-propagating, psraw/vpsraw).
// Convention: UseAVX == 0 forms shift dst in place (SSE two-operand);
// UseAVX > 0 forms are non-destructive, dst = src >> shift.
// vector_len: 0 = 128-bit (vecS/vecD/vecX), 1 = 256-bit (vecY), 2 = 512-bit (vecZ).
instruct vsra2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// SSE in-place, immediate shift count.
instruct vsra2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand, shift count in register.
instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// AVX three-operand, immediate shift count.
instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 4 shorts (64-bit payload in vecD), SSE in-place.
instruct vsra4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 8 shorts (full 128-bit vecX), SSE in-place.
instruct vsra8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 16 shorts: 256-bit form, requires AVX2 (UseAVX > 1) for 256-bit
// integer shifts.
instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 32 shorts: 512-bit form. Word-granularity EVEX shifts additionally
// require the AVX512BW extension, hence the supports_avx512bw() check.
instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9144 
// Integers vector arithmetic right shift (sign-propagating, psrad/vpsrad).
// Same conventions as the shorts section above: SSE forms shift dst in
// place, AVX forms are non-destructive; vector_len 0/1/2 selects the
// 128/256/512-bit encoding.
instruct vsra2I(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 4 ints (full 128-bit vecX).
instruct vsra4I(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 8 ints: 256-bit form, requires AVX2 for 256-bit integer shifts.
instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 16 ints: 512-bit AVX-512 form (dword shifts need no avx512bw).
instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// There are no longs vector arithmetic right shift instructions.
9275 
9276 
9277 // --------------------------------- AND --------------------------------------
9278 
9279 instruct vand4B(vecS dst, vecS src) %{
9280   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9281   match(Set dst (AndV dst src));
9282   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
9283   ins_encode %{
9284     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9285   %}
9286   ins_pipe( pipe_slow );
9287 %}
9288 
9289 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
9290   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9291   match(Set dst (AndV src1 src2));
9292   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
9293   ins_encode %{
9294     int vector_len = 0;
9295     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9296   %}
9297   ins_pipe( pipe_slow );
9298 %}
9299 
9300 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
9301   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9302   match(Set dst (AndV src (LoadVector mem)));
9303   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
9304   ins_encode %{
9305     int vector_len = 0;
9306     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9307   %}
9308   ins_pipe( pipe_slow );
9309 %}
9310 
9311 instruct vand8B(vecD dst, vecD src) %{
9312   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9313   match(Set dst (AndV dst src));
9314   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
9315   ins_encode %{
9316     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9317   %}
9318   ins_pipe( pipe_slow );
9319 %}
9320 
9321 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
9322   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9323   match(Set dst (AndV src1 src2));
9324   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
9325   ins_encode %{
9326     int vector_len = 0;
9327     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9328   %}
9329   ins_pipe( pipe_slow );
9330 %}
9331 
9332 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
9333   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9334   match(Set dst (AndV src (LoadVector mem)));
9335   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
9336   ins_encode %{
9337     int vector_len = 0;
9338     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9339   %}
9340   ins_pipe( pipe_slow );
9341 %}
9342 
9343 instruct vand16B(vecX dst, vecX src) %{
9344   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9345   match(Set dst (AndV dst src));
9346   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
9347   ins_encode %{
9348     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9349   %}
9350   ins_pipe( pipe_slow );
9351 %}
9352 
9353 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
9354   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9355   match(Set dst (AndV src1 src2));
9356   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
9357   ins_encode %{
9358     int vector_len = 0;
9359     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9360   %}
9361   ins_pipe( pipe_slow );
9362 %}
9363 
9364 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
9365   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9366   match(Set dst (AndV src (LoadVector mem)));
9367   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
9368   ins_encode %{
9369     int vector_len = 0;
9370     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9371   %}
9372   ins_pipe( pipe_slow );
9373 %}
9374 
9375 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
9376   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9377   match(Set dst (AndV src1 src2));
9378   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
9379   ins_encode %{
9380     int vector_len = 1;
9381     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9382   %}
9383   ins_pipe( pipe_slow );
9384 %}
9385 
9386 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
9387   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9388   match(Set dst (AndV src (LoadVector mem)));
9389   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
9390   ins_encode %{
9391     int vector_len = 1;
9392     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9393   %}
9394   ins_pipe( pipe_slow );
9395 %}
9396 
9397 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9398   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9399   match(Set dst (AndV src1 src2));
9400   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
9401   ins_encode %{
9402     int vector_len = 2;
9403     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9404   %}
9405   ins_pipe( pipe_slow );
9406 %}
9407 
9408 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
9409   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9410   match(Set dst (AndV src (LoadVector mem)));
9411   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
9412   ins_encode %{
9413     int vector_len = 2;
9414     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9415   %}
9416   ins_pipe( pipe_slow );
9417 %}
9418 
9419 // --------------------------------- OR ---------------------------------------
9420 
9421 instruct vor4B(vecS dst, vecS src) %{
9422   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9423   match(Set dst (OrV dst src));
9424   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
9425   ins_encode %{
9426     __ por($dst$$XMMRegister, $src$$XMMRegister);
9427   %}
9428   ins_pipe( pipe_slow );
9429 %}
9430 
9431 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
9432   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9433   match(Set dst (OrV src1 src2));
9434   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
9435   ins_encode %{
9436     int vector_len = 0;
9437     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9438   %}
9439   ins_pipe( pipe_slow );
9440 %}
9441 
9442 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
9443   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9444   match(Set dst (OrV src (LoadVector mem)));
9445   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
9446   ins_encode %{
9447     int vector_len = 0;
9448     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9449   %}
9450   ins_pipe( pipe_slow );
9451 %}
9452 
9453 instruct vor8B(vecD dst, vecD src) %{
9454   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9455   match(Set dst (OrV dst src));
9456   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
9457   ins_encode %{
9458     __ por($dst$$XMMRegister, $src$$XMMRegister);
9459   %}
9460   ins_pipe( pipe_slow );
9461 %}
9462 
9463 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9464   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9465   match(Set dst (OrV src1 src2));
9466   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9467   ins_encode %{
9468     int vector_len = 0;
9469     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9470   %}
9471   ins_pipe( pipe_slow );
9472 %}
9473 
// Memory-operand OR of two 8-byte vectors: dst = src | load(mem).
// Fix: the predicate previously tested length_in_bytes() == 4 — a
// copy-paste slip from vor4B_mem. This 8-byte (vecD) rule must test
// == 8, matching its siblings vor8B/vor8B_reg and the 8-byte AND/XOR
// memory variants (vand8B_mem, vxor8B_mem).
instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9484 
// 16-byte (128-bit) OR: SSE in-place form.
instruct vor16B(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// 16-byte OR: AVX three-operand form.
instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 16-byte OR with the second operand loaded from memory.
instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 32-byte (256-bit) OR: requires AVX2 (UseAVX > 1), vector_len = 1.
instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// 64-byte (512-bit) OR: requires AVX-512 (UseAVX > 2), vector_len = 2.
instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9560 
9561 // --------------------------------- XOR --------------------------------------
9562 
9563 instruct vxor4B(vecS dst, vecS src) %{
9564   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9565   match(Set dst (XorV dst src));
9566   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9567   ins_encode %{
9568     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9569   %}
9570   ins_pipe( pipe_slow );
9571 %}
9572 
9573 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9574   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9575   match(Set dst (XorV src1 src2));
9576   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9577   ins_encode %{
9578     int vector_len = 0;
9579     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9580   %}
9581   ins_pipe( pipe_slow );
9582 %}
9583 
9584 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9585   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9586   match(Set dst (XorV src (LoadVector mem)));
9587   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9588   ins_encode %{
9589     int vector_len = 0;
9590     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9591   %}
9592   ins_pipe( pipe_slow );
9593 %}
9594 
9595 instruct vxor8B(vecD dst, vecD src) %{
9596   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9597   match(Set dst (XorV dst src));
9598   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9599   ins_encode %{
9600     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9601   %}
9602   ins_pipe( pipe_slow );
9603 %}
9604 
9605 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9606   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9607   match(Set dst (XorV src1 src2));
9608   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9609   ins_encode %{
9610     int vector_len = 0;
9611     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9612   %}
9613   ins_pipe( pipe_slow );
9614 %}
9615 
9616 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9617   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9618   match(Set dst (XorV src (LoadVector mem)));
9619   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9620   ins_encode %{
9621     int vector_len = 0;
9622     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9623   %}
9624   ins_pipe( pipe_slow );
9625 %}
9626 
9627 instruct vxor16B(vecX dst, vecX src) %{
9628   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9629   match(Set dst (XorV dst src));
9630   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9631   ins_encode %{
9632     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9633   %}
9634   ins_pipe( pipe_slow );
9635 %}
9636 
9637 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9638   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9639   match(Set dst (XorV src1 src2));
9640   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9641   ins_encode %{
9642     int vector_len = 0;
9643     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9644   %}
9645   ins_pipe( pipe_slow );
9646 %}
9647 
9648 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9649   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9650   match(Set dst (XorV src (LoadVector mem)));
9651   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9652   ins_encode %{
9653     int vector_len = 0;
9654     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9655   %}
9656   ins_pipe( pipe_slow );
9657 %}
9658 
9659 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9660   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9661   match(Set dst (XorV src1 src2));
9662   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9663   ins_encode %{
9664     int vector_len = 1;
9665     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9666   %}
9667   ins_pipe( pipe_slow );
9668 %}
9669 
9670 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9671   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9672   match(Set dst (XorV src (LoadVector mem)));
9673   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9674   ins_encode %{
9675     int vector_len = 1;
9676     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9677   %}
9678   ins_pipe( pipe_slow );
9679 %}
9680 
9681 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9682   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9683   match(Set dst (XorV src1 src2));
9684   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9685   ins_encode %{
9686     int vector_len = 2;
9687     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9688   %}
9689   ins_pipe( pipe_slow );
9690 %}
9691 
9692 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9693   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9694   match(Set dst (XorV src (LoadVector mem)));
9695   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9696   ins_encode %{
9697     int vector_len = 2;
9698     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9699   %}
9700   ins_pipe( pipe_slow );
9701 %}
9702 
9703 // --------------------------------- FMA --------------------------------------
9704 
9705 // a * b + c
9706 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9707   predicate(UseFMA && n->as_Vector()->length() == 2);
9708   match(Set c (FmaVD  c (Binary a b)));
9709   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9710   ins_cost(150);
9711   ins_encode %{
9712     int vector_len = 0;
9713     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9714   %}
9715   ins_pipe( pipe_slow );
9716 %}
9717 
9718 // a * b + c
// Fused multiply-add of packed doubles, 2 lanes (128-bit), with the
// multiplier b loaded from memory: c = a * b + c (c is addend and destination).
instruct vfma2D_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9730 
9731 
9732 // a * b + c
// Fused multiply-add of packed doubles, 4 lanes (256-bit): c = a * b + c.
// Per the match rule, operand c is both the addend and the destination.
instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9744 
9745 // a * b + c
// Fused multiply-add of packed doubles, 4 lanes (256-bit), with the
// multiplier b loaded from memory: c = a * b + c (c is addend and destination).
instruct vfma4D_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9757 
9758 // a * b + c
// Fused multiply-add of packed doubles, 8 lanes (512-bit): c = a * b + c.
// Per the match rule, operand c is both the addend and the destination.
instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9770 
9771 // a * b + c
// Fused multiply-add of packed doubles, 8 lanes (512-bit), with the
// multiplier b loaded from memory: c = a * b + c (c is addend and destination).
instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9783 
9784 // a * b + c
// Fused multiply-add of packed floats, 4 lanes (128-bit): c = a * b + c.
// Per the match rule, operand c is both the addend and the destination.
instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9796 
9797 // a * b + c
// Fused multiply-add of packed floats, 4 lanes (128-bit), with the
// multiplier b loaded from memory: c = a * b + c (c is addend and destination).
instruct vfma4F_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9809 
9810 // a * b + c
// Fused multiply-add of packed floats, 8 lanes (256-bit): c = a * b + c.
// Per the match rule, operand c is both the addend and the destination.
instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9822 
9823 // a * b + c
// Fused multiply-add of packed floats, 8 lanes (256-bit), with the
// multiplier b loaded from memory: c = a * b + c (c is addend and destination).
instruct vfma8F_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9835 
9836 // a * b + c
// Fused multiply-add of packed floats, 16 lanes (512-bit): c = a * b + c.
// Per the match rule, operand c is both the addend and the destination.
instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9848 
9849 // a * b + c
// Fused multiply-add of packed floats, 16 lanes (512-bit), with the
// multiplier b loaded from memory: c = a * b + c (c is addend and destination).
instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9861 
9862 // --------------------------------- Vector Multiply Add --------------------------------------
9863 
// Multiply-add of packed signed shorts to packed ints (4 shorts -> 2 ints)
// via SSE2 pmaddwd: products of adjacent word pairs are summed into dwords.
// Two-operand SSE form: dst is both the first source and the destination.
instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
  predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulAddVS2VI dst src1));
  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
  ins_encode %{
    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
9873 
// Multiply-add of packed signed shorts to packed ints (4 shorts -> 2 ints)
// via the three-operand AVX form vpmaddwd.
instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit encoding (only the low 8 bytes are significant)
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9884 
// Multiply-add of packed signed shorts to packed ints (8 shorts -> 4 ints)
// via SSE2 pmaddwd: products of adjacent word pairs are summed into dwords.
// Two-operand SSE form: dst is both the first source and the destination.
instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
  predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulAddVS2VI dst src1));
  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
  ins_encode %{
    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
9894 
// Multiply-add of packed signed shorts to packed ints (8 shorts -> 4 ints)
// via the three-operand AVX form vpmaddwd.
instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9905 
// Multiply-add of packed signed shorts to packed ints (16 shorts -> 8 ints)
// via 256-bit vpmaddwd; requires AVX2 (UseAVX > 1).
instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9916 
// Multiply-add of packed signed shorts to packed ints (32 shorts -> 16 ints)
// via 512-bit vpmaddwd; requires AVX-512 (UseAVX > 2).
instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (MulAddVS2VI src1 src2));
  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9927 
9928 // --------------------------------- Vector Multiply Add Add ----------------------------------
9929 
// Fuses a short multiply-add with the following int-vector add (4S -> 2I)
// into one AVX-512 VNNI instruction: dst += pmaddwd(src1, src2).
instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9940 
// Fuses a short multiply-add with the following int-vector add (8S -> 4I)
// into one AVX-512 VNNI instruction: dst += pmaddwd(src1, src2).
instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9951 
// Fuses a short multiply-add with the following int-vector add (16S -> 8I)
// into one AVX-512 VNNI instruction: dst += pmaddwd(src1, src2).
instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9962 
// Fuses a short multiply-add with the following int-vector add (32S -> 16I)
// into one AVX-512 VNNI instruction: dst += pmaddwd(src1, src2).
instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9973 
9974 // --------------------------------- PopCount --------------------------------------
9975 
// Per-lane population count of packed 32-bit ints, 2 lanes.
// Requires the AVX-512 VPOPCNTDQ extension and -XX:+UsePopCountInstruction.
instruct vpopcount2I(vecD dst, vecD src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit encoding (only the low 8 bytes are significant)
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9986 
// Per-lane population count of packed 32-bit ints, 4 lanes.
// Requires the AVX-512 VPOPCNTDQ extension and -XX:+UsePopCountInstruction.
instruct vpopcount4I(vecX dst, vecX src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
  ins_encode %{
    int vector_len = 0; // 128-bit encoding
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
9997 
// Per-lane population count of packed 32-bit ints, 8 lanes.
// Requires the AVX-512 VPOPCNTDQ extension and -XX:+UsePopCountInstruction.
instruct vpopcount8I(vecY dst, vecY src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
  ins_encode %{
    int vector_len = 1; // 256-bit encoding
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
10008 
// Per-lane population count of packed 32-bit ints, 16 lanes.
// Requires the AVX-512 VPOPCNTDQ extension and -XX:+UsePopCountInstruction.
instruct vpopcount16I(vecZ dst, vecZ src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
  ins_encode %{
    int vector_len = 2; // 512-bit encoding
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}