1 //
   2 // Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
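// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// reads as: a slot that is save-on-call for both the register allocator and
// the C calling convention, spilled as a Float (Op_RegF), with encoding 0 and
// the VMReg handle for xmm0 as its concrete register.
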
  62 // XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM15 must be encoded with REX (EVEX for UseAVX); XMM16-XMM31 always require EVEX.
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM15 preserved across function calls (lower 128 bits only)
  75 //              XMM0-XMM3 might hold parameters
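//
// For example, a Double allocated to xmm0 occupies the adjacent slots XMM0 and
// XMM0b (see double_reg_legacy below), while a 256-bit vector spans XMM0
// through XMM0h (see vectory_reg_legacy below).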
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
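// Allocation class holding all of the XMM register slots defined above; the
// flags register gets its own allocation class below.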
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
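// Each reg_class_dynamic below picks the EVEX class when its %{ ... %}
// predicate holds at runtime and falls back to the legacy class otherwise.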
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 733 
 734 // Class for pre evex double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for evex double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre evex 32bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for evex 32bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for pre evex 64bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for evex 64bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for pre evex 128bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for evex 128bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
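
// Illustrative reading (a sketch, not authoritative): reg_class_dynamic picks the
// first class when its guard expression is true at startup and the second class
// otherwise.  For example, on a CPU without EVEX support vectorz_reg resolves to
// vectorz_reg_legacy (XMM0-XMM15 only), while on an AVX-512 machine it resolves to
// vectorz_reg_evex (XMM0-XMM31 on LP64).  The _vl variant additionally requires
// AVX512VL, so operands drawn from it stay within XMM0-XMM15 unless the 128/256-bit
// EVEX forms are actually available.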
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
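  // Illustrative arithmetic (assuming the call above uses the 5-byte E8 rel32 form):
  // the call pushed the address of "next", i.e. the_pc + 5; at this point
  // __ offset() - offset == 5, so the subtraction leaves exactly the_pc on the stack.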
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
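
// Illustrative use of the masks above (a sketch of how rules further down in this
// file typically use them, not tied to any one instruct): AbsF/AbsD clear the sign
// bit by and-ing with the sign mask, while NegF/NegD flip it by xor-ing with the
// sign-flip constant, e.g.:
//   __ andps(dst, ExternalAddress(float_signmask()));   // fabs: clear bit 31
//   __ xorps(dst, ExternalAddress(float_signflip()));   // fneg: flip bit 31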
1375 
1376 
1377 const bool Matcher::match_rule_supported(int opcode) {
1378   if (!has_match_rule(opcode))
1379     return false;
1380 
1381   bool ret_value = true;
1382   switch (opcode) {
1383     case Op_PopCountI:
1384     case Op_PopCountL:
1385       if (!UsePopCountInstruction)
1386         ret_value = false;
1387       break;
1388     case Op_PopCountVI:
1389       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1390         ret_value = false;
1391       break;
1392     case Op_MulVI:
1393       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1394         ret_value = false;
1395       break;
1396     case Op_MulVL:
1397     case Op_MulReductionVL:
1398       if (VM_Version::supports_avx512dq() == false)
1399         ret_value = false;
1400       break;
1401     case Op_AddReductionVL:
1402       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1403         ret_value = false;
1404       break;
1405     case Op_AddReductionVI:
1406       if (UseSSE < 3) // requires at least SSE3
1407         ret_value = false;
1408       break;
1409     case Op_MulReductionVI:
1410       if (UseSSE < 4) // requires at least SSE4
1411         ret_value = false;
1412       break;
1413     case Op_AddReductionVF:
1414     case Op_AddReductionVD:
1415     case Op_MulReductionVF:
1416     case Op_MulReductionVD:
1417       if (UseSSE < 1) // requires at least SSE
1418         ret_value = false;
1419       break;
1420     case Op_SqrtVD:
1421     case Op_SqrtVF:
1422       if (UseAVX < 1) // enabled for AVX only
1423         ret_value = false;
1424       break;
1425     case Op_CompareAndSwapL:
1426 #ifdef _LP64
1427     case Op_CompareAndSwapP:
1428 #endif
1429       if (!VM_Version::supports_cx8())
1430         ret_value = false;
1431       break;
1432     case Op_CMoveVF:
1433     case Op_CMoveVD:
1434       if (UseAVX < 1 || UseAVX > 2)
1435         ret_value = false;
1436       break;
1437     case Op_StrIndexOf:
1438       if (!UseSSE42Intrinsics)
1439         ret_value = false;
1440       break;
1441     case Op_StrIndexOfChar:
1442       if (!UseSSE42Intrinsics)
1443         ret_value = false;
1444       break;
1445     case Op_OnSpinWait:
1446       if (VM_Version::supports_on_spin_wait() == false)
1447         ret_value = false;
1448       break;
1449     case Op_MulAddVS2VI:
1450       if (UseSSE < 2)
1451         ret_value = false;
1452       break;
1453   }
1454 
  return ret_value;  // By default, match rules are supported.
1456 }
1457 
1458 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, guarded by vlen.
1461   bool ret_value = match_rule_supported(opcode);
1462   if (ret_value) {
1463     switch (opcode) {
1464       case Op_AddVB:
1465       case Op_SubVB:
1466         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1467           ret_value = false;
1468         break;
1469       case Op_URShiftVS:
1470       case Op_RShiftVS:
1471       case Op_LShiftVS:
1472       case Op_MulVS:
1473       case Op_AddVS:
1474       case Op_SubVS:
1475         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1476           ret_value = false;
1477         break;
1478       case Op_CMoveVF:
1479         if (vlen != 8)
1480           ret_value  = false;
1481         break;
1482       case Op_CMoveVD:
1483         if (vlen != 4)
1484           ret_value  = false;
1485         break;
1486     }
1487   }
1488 
  return ret_value;  // By default, match rules are supported.
1490 }
1491 
1492 const bool Matcher::has_predicated_vectors(void) {
1493   bool ret_value = false;
1494   if (UseAVX > 2) {
1495     ret_value = VM_Version::supports_avx512vl();
1496   }
1497 
1498   return ret_value;
1499 }
1500 
1501 const int Matcher::float_pressure(int default_pressure_threshold) {
1502   int float_pressure_threshold = default_pressure_threshold;
1503 #ifdef _LP64
1504   if (UseAVX > 2) {
1505     // Increase pressure threshold on machines with AVX3 which have
1506     // 2x more XMM registers.
1507     float_pressure_threshold = default_pressure_threshold * 2;
1508   }
1509 #endif
1510   return float_pressure_threshold;
1511 }
1512 
1513 // Max vector size in bytes. 0 if not supported.
1514 const int Matcher::vector_width_in_bytes(BasicType bt) {
1515   assert(is_java_primitive(bt), "only primitive type vectors");
1516   if (UseSSE < 2) return 0;
1517   // SSE2 supports 128bit vectors for all types.
1518   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
1520   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1521   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1522   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1523     size = (UseAVX > 2) ? 64 : 32;
1524   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1525     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1526   // Use flag to limit vector size.
1527   size = MIN2(size,(int)MaxVectorSize);
1528   // Minimum 2 values in vector (or 4 for bytes).
1529   switch (bt) {
1530   case T_DOUBLE:
1531   case T_LONG:
1532     if (size < 16) return 0;
1533     break;
1534   case T_FLOAT:
1535   case T_INT:
1536     if (size < 8) return 0;
1537     break;
1538   case T_BOOLEAN:
1539     if (size < 4) return 0;
1540     break;
1541   case T_CHAR:
1542     if (size < 4) return 0;
1543     break;
1544   case T_BYTE:
1545     if (size < 4) return 0;
1546     break;
1547   case T_SHORT:
1548     if (size < 4) return 0;
1549     break;
1550   default:
1551     ShouldNotReachHere();
1552   }
1553   return size;
1554 }
1555 
1556 // Limits on vector size (number of elements) loaded into vector.
1557 const int Matcher::max_vector_size(const BasicType bt) {
1558   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1559 }
1560 const int Matcher::min_vector_size(const BasicType bt) {
1561   int max_size = max_vector_size(bt);
  // The minimum size that can be loaded into a vector is 4 bytes.
1563   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1564   return MIN2(size,max_size);
1565 }
1566 
1567 // Vector ideal reg corresponding to specified size in bytes
1568 const uint Matcher::vector_ideal_reg(int size) {
1569   assert(MaxVectorSize >= size, "");
1570   switch(size) {
1571     case  4: return Op_VecS;
1572     case  8: return Op_VecD;
1573     case 16: return Op_VecX;
1574     case 32: return Op_VecY;
1575     case 64: return Op_VecZ;
1576   }
1577   ShouldNotReachHere();
1578   return 0;
1579 }
1580 
1581 // Only lowest bits of xmm reg are used for vector shift count.
1582 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1583   return Op_VecS;
1584 }
1585 
1586 // x86 supports misaligned vectors store/load.
1587 const bool Matcher::misaligned_vectors_ok() {
1588   return true;
1589 }
1590 
1591 // x86 AES instructions are compatible with SunJCE expanded
1592 // keys, hence we do not need to pass the original key to stubs
1593 const bool Matcher::pass_original_key_for_aes() {
1594   return false;
1595 }
1596 
1597 
1598 const bool Matcher::convi2l_type_required = true;
1599 
1600 // Check for shift by small constant as well
1601 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1602   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1603       shift->in(2)->get_int() <= 3 &&
1604       // Are there other uses besides address expressions?
1605       !matcher->is_visited(shift)) {
1606     address_visited.set(shift->_idx); // Flag as address_visited
1607     mstack.push(shift->in(2), Matcher::Visit);
1608     Node *conv = shift->in(1);
1609 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1613     if (conv->Opcode() == Op_ConvI2L &&
1614         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1615         // Are there other uses besides address expressions?
1616         !matcher->is_visited(conv)) {
1617       address_visited.set(conv->_idx); // Flag as address_visited
1618       mstack.push(conv->in(1), Matcher::Pre_Visit);
1619     } else
1620 #endif
1621       mstack.push(conv, Matcher::Pre_Visit);
1622     return true;
1623   }
1624   return false;
1625 }
1626 
1627 // Should the Matcher clone shifts on addressing modes, expecting them
1628 // to be subsumed into complex addressing expressions or compute them
1629 // into registers?
1630 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1631   Node *off = m->in(AddPNode::Offset);
1632   if (off->is_Con()) {
1633     address_visited.test_set(m->_idx); // Flag as address_visited
1634     Node *adr = m->in(AddPNode::Address);
1635 
1636     // Intel can handle 2 adds in addressing mode
1637     // AtomicAdd is not an addressing expression.
1638     // Cheap to find it by looking for screwy base.
1639     if (adr->is_AddP() &&
1640         !adr->in(AddPNode::Base)->is_top() &&
1641         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1642         // Are there other uses besides address expressions?
1643         !is_visited(adr)) {
1644       address_visited.set(adr->_idx); // Flag as address_visited
1645       Node *shift = adr->in(AddPNode::Offset);
1646       if (!clone_shift(shift, this, mstack, address_visited)) {
1647         mstack.push(shift, Pre_Visit);
1648       }
1649       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1650       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1651     } else {
1652       mstack.push(adr, Pre_Visit);
1653     }
1654 
1655     // Clone X+offset as it also folds into most addressing expressions
1656     mstack.push(off, Visit);
1657     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1658     return true;
1659   } else if (clone_shift(off, this, mstack, address_visited)) {
1660     address_visited.test_set(m->_idx); // Flag as address_visited
1661     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1662     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1663     return true;
1664   }
1665   return false;
1666 }
1667 
1668 void Compile::reshape_address(AddPNode* addp) {
1669 }
1670 
1671 // Helper methods for MachSpillCopyNode::implementation().
1672 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1673                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so instructions are
  // emitted into a scratch buffer to determine their size.
1676   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1677   assert(ireg == Op_VecS || // 32bit vector
1678          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1679          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1680          "no non-adjacent vector moves" );
1681   if (cbuf) {
1682     MacroAssembler _masm(cbuf);
1683     int offset = __ offset();
1684     switch (ireg) {
1685     case Op_VecS: // copy whole register
1686     case Op_VecD:
1687     case Op_VecX:
1688 #ifndef _LP64
1689       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1690 #else
1691       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1692         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1693       } else {
1694         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1695      }
1696 #endif
1697       break;
1698     case Op_VecY:
1699 #ifndef _LP64
1700       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1701 #else
1702       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1703         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1704       } else {
1705         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1706      }
1707 #endif
1708       break;
1709     case Op_VecZ:
1710       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1711       break;
1712     default:
1713       ShouldNotReachHere();
1714     }
1715     int size = __ offset() - offset;
1716 #ifdef ASSERT
1717     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1719 #endif
1720     return size;
1721 #ifndef PRODUCT
1722   } else if (!do_size) {
1723     switch (ireg) {
1724     case Op_VecS:
1725     case Op_VecD:
1726     case Op_VecX:
1727       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1728       break;
1729     case Op_VecY:
1730     case Op_VecZ:
1731       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1732       break;
1733     default:
1734       ShouldNotReachHere();
1735     }
1736 #endif
1737   }
1738   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1739   return (UseAVX > 2) ? 6 : 4;
1740 }
1741 
1742 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1743                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so instructions are
  // emitted into a scratch buffer to determine their size.
1746   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1747   if (cbuf) {
1748     MacroAssembler _masm(cbuf);
1749     int offset = __ offset();
1750     if (is_load) {
1751       switch (ireg) {
1752       case Op_VecS:
1753         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1754         break;
1755       case Op_VecD:
1756         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1757         break;
1758       case Op_VecX:
1759 #ifndef _LP64
1760         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1761 #else
1762         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1763           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1764         } else {
1765           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1766           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1767         }
1768 #endif
1769         break;
1770       case Op_VecY:
1771 #ifndef _LP64
1772         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1773 #else
1774         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1775           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1776         } else {
1777           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1778           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1779         }
1780 #endif
1781         break;
1782       case Op_VecZ:
1783         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1784         break;
1785       default:
1786         ShouldNotReachHere();
1787       }
1788     } else { // store
1789       switch (ireg) {
1790       case Op_VecS:
1791         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1792         break;
1793       case Op_VecD:
1794         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1795         break;
1796       case Op_VecX:
1797 #ifndef _LP64
1798         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1799 #else
1800         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1801           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1802         }
1803         else {
1804           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1805         }
1806 #endif
1807         break;
1808       case Op_VecY:
1809 #ifndef _LP64
1810         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1811 #else
1812         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1813           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1814         }
1815         else {
1816           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1817         }
1818 #endif
1819         break;
1820       case Op_VecZ:
1821         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1822         break;
1823       default:
1824         ShouldNotReachHere();
1825       }
1826     }
1827     int size = __ offset() - offset;
1828 #ifdef ASSERT
1829     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1830     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1832 #endif
1833     return size;
1834 #ifndef PRODUCT
1835   } else if (!do_size) {
1836     if (is_load) {
1837       switch (ireg) {
1838       case Op_VecS:
1839         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1840         break;
1841       case Op_VecD:
1842         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1843         break;
1844        case Op_VecX:
1845         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1846         break;
1847       case Op_VecY:
1848       case Op_VecZ:
1849         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1850         break;
1851       default:
1852         ShouldNotReachHere();
1853       }
1854     } else { // store
1855       switch (ireg) {
1856       case Op_VecS:
1857         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1858         break;
1859       case Op_VecD:
1860         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1861         break;
1862        case Op_VecX:
1863         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1864         break;
1865       case Op_VecY:
1866       case Op_VecZ:
1867         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1868         break;
1869       default:
1870         ShouldNotReachHere();
1871       }
1872     }
1873 #endif
1874   }
1875   bool is_single_byte = false;
1876   int vec_len = 0;
1877   if ((UseAVX > 2) && (stack_offset != 0)) {
1878     int tuple_type = Assembler::EVEX_FVM;
1879     int input_size = Assembler::EVEX_32bit;
1880     switch (ireg) {
1881     case Op_VecS:
1882       tuple_type = Assembler::EVEX_T1S;
1883       break;
1884     case Op_VecD:
1885       tuple_type = Assembler::EVEX_T1S;
1886       input_size = Assembler::EVEX_64bit;
1887       break;
1888     case Op_VecX:
1889       break;
1890     case Op_VecY:
1891       vec_len = 1;
1892       break;
1893     case Op_VecZ:
1894       vec_len = 2;
1895       break;
1896     }
1897     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1898   }
1899   int offset_size = 0;
1900   int size = 5;
1901   if (UseAVX > 2 ) {
1902     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1903       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1904       size += 2; // Need an additional two bytes for EVEX encoding
1905     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1906       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1907     } else {
1908       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1910     }
1911   } else {
1912     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1913   }
1914   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1915   return size+offset_size;
1916 }
1917 
1918 static inline jint replicate4_imm(int con, int width) {
1919   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1920   assert(width == 1 || width == 2, "only byte or short types here");
1921   int bit_width = width * 8;
1922   jint val = con;
1923   val &= (1 << bit_width) - 1;  // mask off sign bits
1924   while(bit_width < 32) {
1925     val |= (val << bit_width);
1926     bit_width <<= 1;
1927   }
1928   return val;
1929 }
1930 
1931 static inline jlong replicate8_imm(int con, int width) {
1932   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1933   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1934   int bit_width = width * 8;
1935   jlong val = con;
1936   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1937   while(bit_width < 64) {
1938     val |= (val << bit_width);
1939     bit_width <<= 1;
1940   }
1941   return val;
1942 }
1943 
1944 #ifndef PRODUCT
1945   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1946     st->print("nop \t# %d bytes pad for loops and calls", _count);
1947   }
1948 #endif
1949 
1950   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1951     MacroAssembler _masm(&cbuf);
1952     __ nop(_count);
1953   }
1954 
1955   uint MachNopNode::size(PhaseRegAlloc*) const {
1956     return _count;
1957   }
1958 
1959 #ifndef PRODUCT
1960   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1961     st->print("# breakpoint");
1962   }
1963 #endif
1964 
1965   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1966     MacroAssembler _masm(&cbuf);
1967     __ int3();
1968   }
1969 
1970   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1971     return MachNode::size(ra_);
1972   }
1973 
1974 %}
1975 
1976 encode %{
1977 
1978   enc_class call_epilog %{
1979     if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find the magic cookie on the stack
1981       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1982       MacroAssembler _masm(&cbuf);
1983       Label L;
1984       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1985       __ jccb(Assembler::equal, L);
1986       // Die if stack mismatch
1987       __ int3();
1988       __ bind(L);
1989     }
1990   %}
1991 
1992 %}
1993 
1994 
1995 //----------OPERANDS-----------------------------------------------------------
1996 // Operand definitions must precede instruction definitions for correct parsing
1997 // in the ADLC because operands constitute user defined types which are used in
1998 // instruction definitions.
1999 
2000 operand vecZ() %{
2001   constraint(ALLOC_IN_RC(vectorz_reg));
2002   match(VecZ);
2003 
2004   format %{ %}
2005   interface(REG_INTER);
2006 %}
2007 
2008 operand legVecZ() %{
2009   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2010   match(VecZ);
2011 
2012   format %{ %}
2013   interface(REG_INTER);
2014 %}
2015 
2016 // Comparison Code for FP conditional move
2017 operand cmpOp_vcmppd() %{
2018   match(Bool);
2019 
2020   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2021             n->as_Bool()->_test._test != BoolTest::no_overflow);
2022   format %{ "" %}
2023   interface(COND_INTER) %{
2024     equal        (0x0, "eq");
2025     less         (0x1, "lt");
2026     less_equal   (0x2, "le");
2027     not_equal    (0xC, "ne");
2028     greater_equal(0xD, "ge");
2029     greater      (0xE, "gt");
2030     //TODO cannot compile (adlc breaks) without two next lines with error:
2031     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2032     // equal' for overflow.
2033     overflow     (0x20, "o");  // not really supported by the instruction
2034     no_overflow  (0x21, "no"); // not really supported by the instruction
2035   %}
2036 %}
2037 
2038 
2039 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2040 
2041 // ============================================================================
2042 
2043 instruct ShouldNotReachHere() %{
2044   match(Halt);
2045   format %{ "ud2\t# ShouldNotReachHere" %}
2046   ins_encode %{
2047     __ ud2();
2048   %}
2049   ins_pipe(pipe_slow);
2050 %}
2051 
2052 // =================================EVEX special===============================
2053 
2054 instruct setMask(rRegI dst, rRegI src) %{
2055   predicate(Matcher::has_predicated_vectors());
2056   match(Set dst (SetVectMaskI  src));
2057   effect(TEMP dst);
2058   format %{ "setvectmask   $dst, $src" %}
2059   ins_encode %{
2060     __ setvectmask($dst$$Register, $src$$Register);
2061   %}
2062   ins_pipe(pipe_slow);
2063 %}
2064 
2065 // ============================================================================
2066 
2067 instruct addF_reg(regF dst, regF src) %{
2068   predicate((UseSSE>=1) && (UseAVX == 0));
2069   match(Set dst (AddF dst src));
2070 
2071   format %{ "addss   $dst, $src" %}
2072   ins_cost(150);
2073   ins_encode %{
2074     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2075   %}
2076   ins_pipe(pipe_slow);
2077 %}
2078 
2079 instruct addF_mem(regF dst, memory src) %{
2080   predicate((UseSSE>=1) && (UseAVX == 0));
2081   match(Set dst (AddF dst (LoadF src)));
2082 
2083   format %{ "addss   $dst, $src" %}
2084   ins_cost(150);
2085   ins_encode %{
2086     __ addss($dst$$XMMRegister, $src$$Address);
2087   %}
2088   ins_pipe(pipe_slow);
2089 %}
2090 
2091 instruct addF_imm(regF dst, immF con) %{
2092   predicate((UseSSE>=1) && (UseAVX == 0));
2093   match(Set dst (AddF dst con));
2094   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2095   ins_cost(150);
2096   ins_encode %{
2097     __ addss($dst$$XMMRegister, $constantaddress($con));
2098   %}
2099   ins_pipe(pipe_slow);
2100 %}
2101 
2102 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2103   predicate(UseAVX > 0);
2104   match(Set dst (AddF src1 src2));
2105 
2106   format %{ "vaddss  $dst, $src1, $src2" %}
2107   ins_cost(150);
2108   ins_encode %{
2109     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2110   %}
2111   ins_pipe(pipe_slow);
2112 %}
2113 
2114 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2115   predicate(UseAVX > 0);
2116   match(Set dst (AddF src1 (LoadF src2)));
2117 
2118   format %{ "vaddss  $dst, $src1, $src2" %}
2119   ins_cost(150);
2120   ins_encode %{
2121     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2122   %}
2123   ins_pipe(pipe_slow);
2124 %}
2125 
2126 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2127   predicate(UseAVX > 0);
2128   match(Set dst (AddF src con));
2129 
2130   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2131   ins_cost(150);
2132   ins_encode %{
2133     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2134   %}
2135   ins_pipe(pipe_slow);
2136 %}
2137 
2138 instruct addD_reg(regD dst, regD src) %{
2139   predicate((UseSSE>=2) && (UseAVX == 0));
2140   match(Set dst (AddD dst src));
2141 
2142   format %{ "addsd   $dst, $src" %}
2143   ins_cost(150);
2144   ins_encode %{
2145     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2146   %}
2147   ins_pipe(pipe_slow);
2148 %}
2149 
2150 instruct addD_mem(regD dst, memory src) %{
2151   predicate((UseSSE>=2) && (UseAVX == 0));
2152   match(Set dst (AddD dst (LoadD src)));
2153 
2154   format %{ "addsd   $dst, $src" %}
2155   ins_cost(150);
2156   ins_encode %{
2157     __ addsd($dst$$XMMRegister, $src$$Address);
2158   %}
2159   ins_pipe(pipe_slow);
2160 %}
2161 
2162 instruct addD_imm(regD dst, immD con) %{
2163   predicate((UseSSE>=2) && (UseAVX == 0));
2164   match(Set dst (AddD dst con));
2165   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2166   ins_cost(150);
2167   ins_encode %{
2168     __ addsd($dst$$XMMRegister, $constantaddress($con));
2169   %}
2170   ins_pipe(pipe_slow);
2171 %}
2172 
2173 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2174   predicate(UseAVX > 0);
2175   match(Set dst (AddD src1 src2));
2176 
2177   format %{ "vaddsd  $dst, $src1, $src2" %}
2178   ins_cost(150);
2179   ins_encode %{
2180     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2181   %}
2182   ins_pipe(pipe_slow);
2183 %}
2184 
2185 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2186   predicate(UseAVX > 0);
2187   match(Set dst (AddD src1 (LoadD src2)));
2188 
2189   format %{ "vaddsd  $dst, $src1, $src2" %}
2190   ins_cost(150);
2191   ins_encode %{
2192     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2193   %}
2194   ins_pipe(pipe_slow);
2195 %}
2196 
2197 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2198   predicate(UseAVX > 0);
2199   match(Set dst (AddD src con));
2200 
2201   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2202   ins_cost(150);
2203   ins_encode %{
2204     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2205   %}
2206   ins_pipe(pipe_slow);
2207 %}
2208 
2209 instruct subF_reg(regF dst, regF src) %{
2210   predicate((UseSSE>=1) && (UseAVX == 0));
2211   match(Set dst (SubF dst src));
2212 
2213   format %{ "subss   $dst, $src" %}
2214   ins_cost(150);
2215   ins_encode %{
2216     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2217   %}
2218   ins_pipe(pipe_slow);
2219 %}
2220 
2221 instruct subF_mem(regF dst, memory src) %{
2222   predicate((UseSSE>=1) && (UseAVX == 0));
2223   match(Set dst (SubF dst (LoadF src)));
2224 
2225   format %{ "subss   $dst, $src" %}
2226   ins_cost(150);
2227   ins_encode %{
2228     __ subss($dst$$XMMRegister, $src$$Address);
2229   %}
2230   ins_pipe(pipe_slow);
2231 %}
2232 
2233 instruct subF_imm(regF dst, immF con) %{
2234   predicate((UseSSE>=1) && (UseAVX == 0));
2235   match(Set dst (SubF dst con));
2236   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2237   ins_cost(150);
2238   ins_encode %{
2239     __ subss($dst$$XMMRegister, $constantaddress($con));
2240   %}
2241   ins_pipe(pipe_slow);
2242 %}
2243 
2244 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2245   predicate(UseAVX > 0);
2246   match(Set dst (SubF src1 src2));
2247 
2248   format %{ "vsubss  $dst, $src1, $src2" %}
2249   ins_cost(150);
2250   ins_encode %{
2251     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2252   %}
2253   ins_pipe(pipe_slow);
2254 %}
2255 
2256 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2257   predicate(UseAVX > 0);
2258   match(Set dst (SubF src1 (LoadF src2)));
2259 
2260   format %{ "vsubss  $dst, $src1, $src2" %}
2261   ins_cost(150);
2262   ins_encode %{
2263     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2264   %}
2265   ins_pipe(pipe_slow);
2266 %}
2267 
2268 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2269   predicate(UseAVX > 0);
2270   match(Set dst (SubF src con));
2271 
2272   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2273   ins_cost(150);
2274   ins_encode %{
2275     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2276   %}
2277   ins_pipe(pipe_slow);
2278 %}
2279 
2280 instruct subD_reg(regD dst, regD src) %{
2281   predicate((UseSSE>=2) && (UseAVX == 0));
2282   match(Set dst (SubD dst src));
2283 
2284   format %{ "subsd   $dst, $src" %}
2285   ins_cost(150);
2286   ins_encode %{
2287     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2288   %}
2289   ins_pipe(pipe_slow);
2290 %}
2291 
2292 instruct subD_mem(regD dst, memory src) %{
2293   predicate((UseSSE>=2) && (UseAVX == 0));
2294   match(Set dst (SubD dst (LoadD src)));
2295 
2296   format %{ "subsd   $dst, $src" %}
2297   ins_cost(150);
2298   ins_encode %{
2299     __ subsd($dst$$XMMRegister, $src$$Address);
2300   %}
2301   ins_pipe(pipe_slow);
2302 %}
2303 
2304 instruct subD_imm(regD dst, immD con) %{
2305   predicate((UseSSE>=2) && (UseAVX == 0));
2306   match(Set dst (SubD dst con));
2307   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2308   ins_cost(150);
2309   ins_encode %{
2310     __ subsd($dst$$XMMRegister, $constantaddress($con));
2311   %}
2312   ins_pipe(pipe_slow);
2313 %}
2314 
2315 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2316   predicate(UseAVX > 0);
2317   match(Set dst (SubD src1 src2));
2318 
2319   format %{ "vsubsd  $dst, $src1, $src2" %}
2320   ins_cost(150);
2321   ins_encode %{
2322     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2323   %}
2324   ins_pipe(pipe_slow);
2325 %}
2326 
2327 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2328   predicate(UseAVX > 0);
2329   match(Set dst (SubD src1 (LoadD src2)));
2330 
2331   format %{ "vsubsd  $dst, $src1, $src2" %}
2332   ins_cost(150);
2333   ins_encode %{
2334     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2335   %}
2336   ins_pipe(pipe_slow);
2337 %}
2338 
2339 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2340   predicate(UseAVX > 0);
2341   match(Set dst (SubD src con));
2342 
2343   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2344   ins_cost(150);
2345   ins_encode %{
2346     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2347   %}
2348   ins_pipe(pipe_slow);
2349 %}
2350 
2351 instruct mulF_reg(regF dst, regF src) %{
2352   predicate((UseSSE>=1) && (UseAVX == 0));
2353   match(Set dst (MulF dst src));
2354 
2355   format %{ "mulss   $dst, $src" %}
2356   ins_cost(150);
2357   ins_encode %{
2358     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2359   %}
2360   ins_pipe(pipe_slow);
2361 %}
2362 
2363 instruct mulF_mem(regF dst, memory src) %{
2364   predicate((UseSSE>=1) && (UseAVX == 0));
2365   match(Set dst (MulF dst (LoadF src)));
2366 
2367   format %{ "mulss   $dst, $src" %}
2368   ins_cost(150);
2369   ins_encode %{
2370     __ mulss($dst$$XMMRegister, $src$$Address);
2371   %}
2372   ins_pipe(pipe_slow);
2373 %}
2374 
2375 instruct mulF_imm(regF dst, immF con) %{
2376   predicate((UseSSE>=1) && (UseAVX == 0));
2377   match(Set dst (MulF dst con));
2378   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2379   ins_cost(150);
2380   ins_encode %{
2381     __ mulss($dst$$XMMRegister, $constantaddress($con));
2382   %}
2383   ins_pipe(pipe_slow);
2384 %}
2385 
2386 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2387   predicate(UseAVX > 0);
2388   match(Set dst (MulF src1 src2));
2389 
2390   format %{ "vmulss  $dst, $src1, $src2" %}
2391   ins_cost(150);
2392   ins_encode %{
2393     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2394   %}
2395   ins_pipe(pipe_slow);
2396 %}
2397 
2398 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2399   predicate(UseAVX > 0);
2400   match(Set dst (MulF src1 (LoadF src2)));
2401 
2402   format %{ "vmulss  $dst, $src1, $src2" %}
2403   ins_cost(150);
2404   ins_encode %{
2405     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2406   %}
2407   ins_pipe(pipe_slow);
2408 %}
2409 
2410 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2411   predicate(UseAVX > 0);
2412   match(Set dst (MulF src con));
2413 
2414   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2415   ins_cost(150);
2416   ins_encode %{
2417     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2418   %}
2419   ins_pipe(pipe_slow);
2420 %}
2421 
2422 instruct mulD_reg(regD dst, regD src) %{
2423   predicate((UseSSE>=2) && (UseAVX == 0));
2424   match(Set dst (MulD dst src));
2425 
2426   format %{ "mulsd   $dst, $src" %}
2427   ins_cost(150);
2428   ins_encode %{
2429     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2430   %}
2431   ins_pipe(pipe_slow);
2432 %}
2433 
2434 instruct mulD_mem(regD dst, memory src) %{
2435   predicate((UseSSE>=2) && (UseAVX == 0));
2436   match(Set dst (MulD dst (LoadD src)));
2437 
2438   format %{ "mulsd   $dst, $src" %}
2439   ins_cost(150);
2440   ins_encode %{
2441     __ mulsd($dst$$XMMRegister, $src$$Address);
2442   %}
2443   ins_pipe(pipe_slow);
2444 %}
2445 
2446 instruct mulD_imm(regD dst, immD con) %{
2447   predicate((UseSSE>=2) && (UseAVX == 0));
2448   match(Set dst (MulD dst con));
2449   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2450   ins_cost(150);
2451   ins_encode %{
2452     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2453   %}
2454   ins_pipe(pipe_slow);
2455 %}
2456 
2457 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2458   predicate(UseAVX > 0);
2459   match(Set dst (MulD src1 src2));
2460 
2461   format %{ "vmulsd  $dst, $src1, $src2" %}
2462   ins_cost(150);
2463   ins_encode %{
2464     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2465   %}
2466   ins_pipe(pipe_slow);
2467 %}
2468 
2469 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2470   predicate(UseAVX > 0);
2471   match(Set dst (MulD src1 (LoadD src2)));
2472 
2473   format %{ "vmulsd  $dst, $src1, $src2" %}
2474   ins_cost(150);
2475   ins_encode %{
2476     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2477   %}
2478   ins_pipe(pipe_slow);
2479 %}
2480 
2481 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2482   predicate(UseAVX > 0);
2483   match(Set dst (MulD src con));
2484 
2485   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2486   ins_cost(150);
2487   ins_encode %{
2488     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2489   %}
2490   ins_pipe(pipe_slow);
2491 %}
2492 
2493 instruct divF_reg(regF dst, regF src) %{
2494   predicate((UseSSE>=1) && (UseAVX == 0));
2495   match(Set dst (DivF dst src));
2496 
2497   format %{ "divss   $dst, $src" %}
2498   ins_cost(150);
2499   ins_encode %{
2500     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2501   %}
2502   ins_pipe(pipe_slow);
2503 %}
2504 
2505 instruct divF_mem(regF dst, memory src) %{
2506   predicate((UseSSE>=1) && (UseAVX == 0));
2507   match(Set dst (DivF dst (LoadF src)));
2508 
2509   format %{ "divss   $dst, $src" %}
2510   ins_cost(150);
2511   ins_encode %{
2512     __ divss($dst$$XMMRegister, $src$$Address);
2513   %}
2514   ins_pipe(pipe_slow);
2515 %}
2516 
2517 instruct divF_imm(regF dst, immF con) %{
2518   predicate((UseSSE>=1) && (UseAVX == 0));
2519   match(Set dst (DivF dst con));
2520   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2521   ins_cost(150);
2522   ins_encode %{
2523     __ divss($dst$$XMMRegister, $constantaddress($con));
2524   %}
2525   ins_pipe(pipe_slow);
2526 %}
2527 
2528 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2529   predicate(UseAVX > 0);
2530   match(Set dst (DivF src1 src2));
2531 
2532   format %{ "vdivss  $dst, $src1, $src2" %}
2533   ins_cost(150);
2534   ins_encode %{
2535     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2536   %}
2537   ins_pipe(pipe_slow);
2538 %}
2539 
2540 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2541   predicate(UseAVX > 0);
2542   match(Set dst (DivF src1 (LoadF src2)));
2543 
2544   format %{ "vdivss  $dst, $src1, $src2" %}
2545   ins_cost(150);
2546   ins_encode %{
2547     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2548   %}
2549   ins_pipe(pipe_slow);
2550 %}
2551 
2552 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2553   predicate(UseAVX > 0);
2554   match(Set dst (DivF src con));
2555 
2556   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2557   ins_cost(150);
2558   ins_encode %{
2559     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2560   %}
2561   ins_pipe(pipe_slow);
2562 %}
2563 
2564 instruct divD_reg(regD dst, regD src) %{
2565   predicate((UseSSE>=2) && (UseAVX == 0));
2566   match(Set dst (DivD dst src));
2567 
2568   format %{ "divsd   $dst, $src" %}
2569   ins_cost(150);
2570   ins_encode %{
2571     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2572   %}
2573   ins_pipe(pipe_slow);
2574 %}
2575 
2576 instruct divD_mem(regD dst, memory src) %{
2577   predicate((UseSSE>=2) && (UseAVX == 0));
2578   match(Set dst (DivD dst (LoadD src)));
2579 
2580   format %{ "divsd   $dst, $src" %}
2581   ins_cost(150);
2582   ins_encode %{
2583     __ divsd($dst$$XMMRegister, $src$$Address);
2584   %}
2585   ins_pipe(pipe_slow);
2586 %}
2587 
2588 instruct divD_imm(regD dst, immD con) %{
2589   predicate((UseSSE>=2) && (UseAVX == 0));
2590   match(Set dst (DivD dst con));
2591   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2592   ins_cost(150);
2593   ins_encode %{
2594     __ divsd($dst$$XMMRegister, $constantaddress($con));
2595   %}
2596   ins_pipe(pipe_slow);
2597 %}
2598 
2599 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2600   predicate(UseAVX > 0);
2601   match(Set dst (DivD src1 src2));
2602 
2603   format %{ "vdivsd  $dst, $src1, $src2" %}
2604   ins_cost(150);
2605   ins_encode %{
2606     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2607   %}
2608   ins_pipe(pipe_slow);
2609 %}
2610 
2611 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2612   predicate(UseAVX > 0);
2613   match(Set dst (DivD src1 (LoadD src2)));
2614 
2615   format %{ "vdivsd  $dst, $src1, $src2" %}
2616   ins_cost(150);
2617   ins_encode %{
2618     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2619   %}
2620   ins_pipe(pipe_slow);
2621 %}
2622 
2623 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2624   predicate(UseAVX > 0);
2625   match(Set dst (DivD src con));
2626 
2627   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2628   ins_cost(150);
2629   ins_encode %{
2630     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2631   %}
2632   ins_pipe(pipe_slow);
2633 %}
2634 
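// Absolute value and negation are implemented by AND-ing with a sign mask or
// XOR-ing with a sign-flip constant taken from the shared float/double mask
// tables (float_signmask(), double_signflip(), etc.).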
2635 instruct absF_reg(regF dst) %{
2636   predicate((UseSSE>=1) && (UseAVX == 0));
2637   match(Set dst (AbsF dst));
2638   ins_cost(150);
2639   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2640   ins_encode %{
2641     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2642   %}
2643   ins_pipe(pipe_slow);
2644 %}
2645 
2646 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2647   predicate(UseAVX > 0);
2648   match(Set dst (AbsF src));
2649   ins_cost(150);
2650   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2651   ins_encode %{
2652     int vector_len = 0;
2653     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2654               ExternalAddress(float_signmask()), vector_len);
2655   %}
2656   ins_pipe(pipe_slow);
2657 %}
2658 
2659 instruct absD_reg(regD dst) %{
2660   predicate((UseSSE>=2) && (UseAVX == 0));
2661   match(Set dst (AbsD dst));
2662   ins_cost(150);
2663   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2664             "# abs double by sign masking" %}
2665   ins_encode %{
2666     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2667   %}
2668   ins_pipe(pipe_slow);
2669 %}
2670 
2671 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2672   predicate(UseAVX > 0);
2673   match(Set dst (AbsD src));
2674   ins_cost(150);
2675   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2676             "# abs double by sign masking" %}
2677   ins_encode %{
2678     int vector_len = 0;
2679     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2680               ExternalAddress(double_signmask()), vector_len);
2681   %}
2682   ins_pipe(pipe_slow);
2683 %}
2684 
2685 instruct negF_reg(regF dst) %{
2686   predicate((UseSSE>=1) && (UseAVX == 0));
2687   match(Set dst (NegF dst));
2688   ins_cost(150);
2689   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2690   ins_encode %{
2691     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2692   %}
2693   ins_pipe(pipe_slow);
2694 %}
2695 
2696 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2697   predicate(UseAVX > 0);
2698   match(Set dst (NegF src));
2699   ins_cost(150);
2700   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2701   ins_encode %{
2702     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2703                  ExternalAddress(float_signflip()));
2704   %}
2705   ins_pipe(pipe_slow);
2706 %}
2707 
2708 instruct negD_reg(regD dst) %{
2709   predicate((UseSSE>=2) && (UseAVX == 0));
2710   match(Set dst (NegD dst));
2711   ins_cost(150);
2712   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2713             "# neg double by sign flipping" %}
2714   ins_encode %{
2715     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2716   %}
2717   ins_pipe(pipe_slow);
2718 %}
2719 
2720 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2721   predicate(UseAVX > 0);
2722   match(Set dst (NegD src));
2723   ins_cost(150);
2724   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2725             "# neg double by sign flipping" %}
2726   ins_encode %{
2727     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2728                  ExternalAddress(double_signflip()));
2729   %}
2730   ins_pipe(pipe_slow);
2731 %}
2732 
2733 instruct sqrtF_reg(regF dst, regF src) %{
2734   predicate(UseSSE>=1);
2735   match(Set dst (SqrtF src));
2736 
2737   format %{ "sqrtss  $dst, $src" %}
2738   ins_cost(150);
2739   ins_encode %{
2740     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2741   %}
2742   ins_pipe(pipe_slow);
2743 %}
2744 
2745 instruct sqrtF_mem(regF dst, memory src) %{
2746   predicate(UseSSE>=1);
2747   match(Set dst (SqrtF (LoadF src)));
2748 
2749   format %{ "sqrtss  $dst, $src" %}
2750   ins_cost(150);
2751   ins_encode %{
2752     __ sqrtss($dst$$XMMRegister, $src$$Address);
2753   %}
2754   ins_pipe(pipe_slow);
2755 %}
2756 
2757 instruct sqrtF_imm(regF dst, immF con) %{
2758   predicate(UseSSE>=1);
2759   match(Set dst (SqrtF con));
2760 
2761   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2762   ins_cost(150);
2763   ins_encode %{
2764     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2765   %}
2766   ins_pipe(pipe_slow);
2767 %}
2768 
2769 instruct sqrtD_reg(regD dst, regD src) %{
2770   predicate(UseSSE>=2);
2771   match(Set dst (SqrtD src));
2772 
2773   format %{ "sqrtsd  $dst, $src" %}
2774   ins_cost(150);
2775   ins_encode %{
2776     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2777   %}
2778   ins_pipe(pipe_slow);
2779 %}
2780 
2781 instruct sqrtD_mem(regD dst, memory src) %{
2782   predicate(UseSSE>=2);
2783   match(Set dst (SqrtD (LoadD src)));
2784 
2785   format %{ "sqrtsd  $dst, $src" %}
2786   ins_cost(150);
2787   ins_encode %{
2788     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2789   %}
2790   ins_pipe(pipe_slow);
2791 %}
2792 
2793 instruct sqrtD_imm(regD dst, immD con) %{
2794   predicate(UseSSE>=2);
2795   match(Set dst (SqrtD con));
2796   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2797   ins_cost(150);
2798   ins_encode %{
2799     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2800   %}
2801   ins_pipe(pipe_slow);
2802 %}
2803 
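// Spin-wait hint: emits a PAUSE instruction to tell the CPU the thread is in a
// spin-wait loop (backs the Thread.onSpinWait() intrinsic).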
2804 instruct onspinwait() %{
2805   match(OnSpinWait);
2806   ins_cost(200);
2807 
2808   format %{
2809     $$template
2810     $$emit$$"pause\t! membar_onspinwait"
2811   %}
2812   ins_encode %{
2813     __ pause();
2814   %}
2815   ins_pipe(pipe_slow);
2816 %}
2817 
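// Fused multiply-add (guarded by UseFMA): FmaF/FmaD compute a * b + c with a
// single rounding step.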
2818 // a * b + c
2819 instruct fmaD_reg(regD a, regD b, regD c) %{
2820   predicate(UseFMA);
2821   match(Set c (FmaD  c (Binary a b)));
2822   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2823   ins_cost(150);
2824   ins_encode %{
2825     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2826   %}
2827   ins_pipe( pipe_slow );
2828 %}
2829 
2830 // a * b + c
2831 instruct fmaF_reg(regF a, regF b, regF c) %{
2832   predicate(UseFMA);
2833   match(Set c (FmaF  c (Binary a b)));
2834   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2835   ins_cost(150);
2836   ins_encode %{
2837     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2838   %}
2839   ins_pipe( pipe_slow );
2840 %}
2841 
2842 // ====================VECTOR INSTRUCTIONS=====================================
2843 
2844 
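// Vector loads, stores and register-class moves.  Vector operands are sized by
// type: vecS = 4 bytes, vecD = 8, vecX = 16, vecY = 32, vecZ = 64 bytes.  The
// MoveVec*2Leg / MoveLeg2Vec* forms copy a vector between the normal and the
// "legacy" (legVec*) register classes.
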
2845 // Load vectors (4 bytes long)
2846 instruct loadV4(vecS dst, memory mem) %{
2847   predicate(n->as_LoadVector()->memory_size() == 4);
2848   match(Set dst (LoadVector mem));
2849   ins_cost(125);
2850   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2851   ins_encode %{
2852     __ movdl($dst$$XMMRegister, $mem$$Address);
2853   %}
2854   ins_pipe( pipe_slow );
2855 %}
2856 
2857 // Move vectors (4 bytes long)
2858 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2859   match(Set dst src);
2860   format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
2861   ins_encode %{
2862     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2863   %}
2864   ins_pipe( fpu_reg_reg );
2865 %}
2866 
2867 // Move vectors (4 bytes long)
2868 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2869   match(Set dst src);
2870   format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
2871   ins_encode %{
2872     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2873   %}
2874   ins_pipe( fpu_reg_reg );
2875 %}
2876 
2877 // Load vectors (8 bytes long)
2878 instruct loadV8(vecD dst, memory mem) %{
2879   predicate(n->as_LoadVector()->memory_size() == 8);
2880   match(Set dst (LoadVector mem));
2881   ins_cost(125);
2882   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2883   ins_encode %{
2884     __ movq($dst$$XMMRegister, $mem$$Address);
2885   %}
2886   ins_pipe( pipe_slow );
2887 %}
2888 
2889 // Move vectors (8 bytes long)
2890 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
2891   match(Set dst src);
2892   format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
2893   ins_encode %{
2894     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2895   %}
2896   ins_pipe( fpu_reg_reg );
2897 %}
2898 
2899 // Move vectors (8 bytes long)
2900 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
2901   match(Set dst src);
2902   format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
2903   ins_encode %{
2904     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2905   %}
2906   ins_pipe( fpu_reg_reg );
2907 %}
2908 
2909 // Load vectors (16 bytes long)
2910 instruct loadV16(vecX dst, memory mem) %{
2911   predicate(n->as_LoadVector()->memory_size() == 16);
2912   match(Set dst (LoadVector mem));
2913   ins_cost(125);
2914   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2915   ins_encode %{
2916     __ movdqu($dst$$XMMRegister, $mem$$Address);
2917   %}
2918   ins_pipe( pipe_slow );
2919 %}
2920 
2921 // Move vectors (16 bytes long)
2922 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
2923   match(Set dst src);
2924   format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
2925   ins_encode %{
2926     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2927       int vector_len = 2;
2928       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2929     } else {
2930       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2931     }
2932   %}
2933   ins_pipe( fpu_reg_reg );
2934 %}
2935 
2936 // Move vectors (16 bytes long)
2937 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
2938   match(Set dst src);
2939   format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
2940   ins_encode %{
2941     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2942       int vector_len = 2;
2943       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2944     } else {
2945       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2946     }
2947   %}
2948   ins_pipe( fpu_reg_reg );
2949 %}
2950 
2951 // Load vectors (32 bytes long)
2952 instruct loadV32(vecY dst, memory mem) %{
2953   predicate(n->as_LoadVector()->memory_size() == 32);
2954   match(Set dst (LoadVector mem));
2955   ins_cost(125);
2956   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2957   ins_encode %{
2958     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2959   %}
2960   ins_pipe( pipe_slow );
2961 %}
2962 
2963 // Move vectors (32 bytes long)
2964 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
2965   match(Set dst src);
2966   format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
2967   ins_encode %{
2968     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2969       int vector_len = 2;
2970       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2971     } else {
2972       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2973     }
2974   %}
2975   ins_pipe( fpu_reg_reg );
2976 %}
2977 
2978 // Move vectors (32 bytes long)
2979 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
2980   match(Set dst src);
2981   format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
2982   ins_encode %{
2983     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2984       int vector_len = 2;
2985       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2986     } else {
2987       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2988     }
2989   %}
2990   ins_pipe( fpu_reg_reg );
2991 %}
2992 
2993 // Load vectors (64 bytes long)
2994 instruct loadV64_dword(vecZ dst, memory mem) %{
2995   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2996   match(Set dst (LoadVector mem));
2997   ins_cost(125);
2998   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
2999   ins_encode %{
3000     int vector_len = 2;
3001     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3002   %}
3003   ins_pipe( pipe_slow );
3004 %}
3005 
3006 // Load vectors (64 bytes long)
3007 instruct loadV64_qword(vecZ dst, memory mem) %{
3008   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3009   match(Set dst (LoadVector mem));
3010   ins_cost(125);
3011   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3012   ins_encode %{
3013     int vector_len = 2;
3014     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3015   %}
3016   ins_pipe( pipe_slow );
3017 %}
3018 
3019 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3020   match(Set dst src);
3021   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3022   ins_encode %{
3023     int vector_len = 2;
3024     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3025   %}
3026   ins_pipe( fpu_reg_reg );
3027 %}
3028 
3029 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3030   match(Set dst src);
3031   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3032   ins_encode %{
3033     int vector_len = 2;
3034     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3035   %}
3036   ins_pipe( fpu_reg_reg );
3037 %}
3038 
3039 // Store vectors
3040 instruct storeV4(memory mem, vecS src) %{
3041   predicate(n->as_StoreVector()->memory_size() == 4);
3042   match(Set mem (StoreVector mem src));
3043   ins_cost(145);
3044   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3045   ins_encode %{
3046     __ movdl($mem$$Address, $src$$XMMRegister);
3047   %}
3048   ins_pipe( pipe_slow );
3049 %}
3050 
3051 instruct storeV8(memory mem, vecD src) %{
3052   predicate(n->as_StoreVector()->memory_size() == 8);
3053   match(Set mem (StoreVector mem src));
3054   ins_cost(145);
3055   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3056   ins_encode %{
3057     __ movq($mem$$Address, $src$$XMMRegister);
3058   %}
3059   ins_pipe( pipe_slow );
3060 %}
3061 
3062 instruct storeV16(memory mem, vecX src) %{
3063   predicate(n->as_StoreVector()->memory_size() == 16);
3064   match(Set mem (StoreVector mem src));
3065   ins_cost(145);
3066   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3067   ins_encode %{
3068     __ movdqu($mem$$Address, $src$$XMMRegister);
3069   %}
3070   ins_pipe( pipe_slow );
3071 %}
3072 
3073 instruct storeV32(memory mem, vecY src) %{
3074   predicate(n->as_StoreVector()->memory_size() == 32);
3075   match(Set mem (StoreVector mem src));
3076   ins_cost(145);
3077   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3078   ins_encode %{
3079     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3080   %}
3081   ins_pipe( pipe_slow );
3082 %}
3083 
3084 instruct storeV64_dword(memory mem, vecZ src) %{
3085   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3086   match(Set mem (StoreVector mem src));
3087   ins_cost(145);
3088   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3089   ins_encode %{
3090     int vector_len = 2;
3091     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3092   %}
3093   ins_pipe( pipe_slow );
3094 %}
3095 
3096 instruct storeV64_qword(memory mem, vecZ src) %{
3097   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3098   match(Set mem (StoreVector mem src));
3099   ins_cost(145);
3100   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3101   ins_encode %{
3102     int vector_len = 2;
3103     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3104   %}
3105   ins_pipe( pipe_slow );
3106 %}
3107 
3108 // ====================LEGACY REPLICATE=======================================
3109 
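// Replicate (broadcast) a scalar to every element of a vector.  These legacy
// forms are used when the corresponding AVX-512 VL/BW broadcast instructions
// are not available (see the !VM_Version::supports_avx512vlbw()/avx512vl()
// predicates); each element type has register, memory and immediate variants.
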
3110 instruct Repl4B_mem(vecS dst, memory mem) %{
3111   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3112   match(Set dst (ReplicateB (LoadB mem)));
3113   format %{ "punpcklbw $dst,$mem\n\t"
3114             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3115   ins_encode %{
3116     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3117     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3118   %}
3119   ins_pipe( pipe_slow );
3120 %}
3121 
3122 instruct Repl8B_mem(vecD dst, memory mem) %{
3123   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3124   match(Set dst (ReplicateB (LoadB mem)));
3125   format %{ "punpcklbw $dst,$mem\n\t"
3126             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3127   ins_encode %{
3128     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3129     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3130   %}
3131   ins_pipe( pipe_slow );
3132 %}
3133 
3134 instruct Repl16B(vecX dst, rRegI src) %{
3135   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3136   match(Set dst (ReplicateB src));
3137   format %{ "movd    $dst,$src\n\t"
3138             "punpcklbw $dst,$dst\n\t"
3139             "pshuflw $dst,$dst,0x00\n\t"
3140             "punpcklqdq $dst,$dst\t! replicate16B" %}
3141   ins_encode %{
3142     __ movdl($dst$$XMMRegister, $src$$Register);
3143     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3144     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3145     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3146   %}
3147   ins_pipe( pipe_slow );
3148 %}
3149 
3150 instruct Repl16B_mem(vecX dst, memory mem) %{
3151   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3152   match(Set dst (ReplicateB (LoadB mem)));
3153   format %{ "punpcklbw $dst,$mem\n\t"
3154             "pshuflw $dst,$dst,0x00\n\t"
3155             "punpcklqdq $dst,$dst\t! replicate16B" %}
3156   ins_encode %{
3157     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3158     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3159     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3160   %}
3161   ins_pipe( pipe_slow );
3162 %}
3163 
3164 instruct Repl32B(vecY dst, rRegI src) %{
3165   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3166   match(Set dst (ReplicateB src));
3167   format %{ "movd    $dst,$src\n\t"
3168             "punpcklbw $dst,$dst\n\t"
3169             "pshuflw $dst,$dst,0x00\n\t"
3170             "punpcklqdq $dst,$dst\n\t"
3171             "vinserti128_high $dst,$dst\t! replicate32B" %}
3172   ins_encode %{
3173     __ movdl($dst$$XMMRegister, $src$$Register);
3174     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3175     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3176     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3177     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3178   %}
3179   ins_pipe( pipe_slow );
3180 %}
3181 
3182 instruct Repl32B_mem(vecY dst, memory mem) %{
3183   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3184   match(Set dst (ReplicateB (LoadB mem)));
3185   format %{ "punpcklbw $dst,$mem\n\t"
3186             "pshuflw $dst,$dst,0x00\n\t"
3187             "punpcklqdq $dst,$dst\n\t"
3188             "vinserti128_high $dst,$dst\t! replicate32B" %}
3189   ins_encode %{
3190     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3191     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3192     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3193     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3194   %}
3195   ins_pipe( pipe_slow );
3196 %}
3197 
3198 instruct Repl64B(legVecZ dst, rRegI src) %{
3199   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3200   match(Set dst (ReplicateB src));
3201   format %{ "movd    $dst,$src\n\t"
3202             "punpcklbw $dst,$dst\n\t"
3203             "pshuflw $dst,$dst,0x00\n\t"
3204             "punpcklqdq $dst,$dst\n\t"
3205             "vinserti128_high $dst,$dst\n\t"
3206             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3207   ins_encode %{
3208     __ movdl($dst$$XMMRegister, $src$$Register);
3209     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3210     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3211     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3212     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3213     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3214   %}
3215   ins_pipe( pipe_slow );
3216 %}
3217 
3218 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3219   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3220   match(Set dst (ReplicateB (LoadB mem)));
3221   format %{ "punpcklbw $dst,$mem\n\t"
3222             "pshuflw $dst,$dst,0x00\n\t"
3223             "punpcklqdq $dst,$dst\n\t"
3224             "vinserti128_high $dst,$dst\n\t"
3225             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3226   ins_encode %{
3227     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3228     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3229     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3230     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3231     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3232   %}
3233   ins_pipe( pipe_slow );
3234 %}
3235 
3236 instruct Repl16B_imm(vecX dst, immI con) %{
3237   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3238   match(Set dst (ReplicateB con));
3239   format %{ "movq    $dst,[$constantaddress]\n\t"
3240             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3241   ins_encode %{
3242     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3243     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3244   %}
3245   ins_pipe( pipe_slow );
3246 %}
3247 
3248 instruct Repl32B_imm(vecY dst, immI con) %{
3249   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3250   match(Set dst (ReplicateB con));
3251   format %{ "movq    $dst,[$constantaddress]\n\t"
3252             "punpcklqdq $dst,$dst\n\t"
3253             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3254   ins_encode %{
3255     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3256     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3257     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3258   %}
3259   ins_pipe( pipe_slow );
3260 %}
3261 
3262 instruct Repl64B_imm(legVecZ dst, immI con) %{
3263   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3264   match(Set dst (ReplicateB con));
3265   format %{ "movq    $dst,[$constantaddress]\n\t"
3266             "punpcklqdq $dst,$dst\n\t"
3267             "vinserti128_high $dst,$dst\n\t"
3268             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3269   ins_encode %{
3270     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3271     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3272     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3273     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3274   %}
3275   ins_pipe( pipe_slow );
3276 %}
3277 
3278 instruct Repl4S(vecD dst, rRegI src) %{
3279   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3280   match(Set dst (ReplicateS src));
3281   format %{ "movd    $dst,$src\n\t"
3282             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3283   ins_encode %{
3284     __ movdl($dst$$XMMRegister, $src$$Register);
3285     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3286   %}
3287   ins_pipe( pipe_slow );
3288 %}
3289 
3290 instruct Repl4S_mem(vecD dst, memory mem) %{
3291   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3292   match(Set dst (ReplicateS (LoadS mem)));
3293   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3294   ins_encode %{
3295     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3296   %}
3297   ins_pipe( pipe_slow );
3298 %}
3299 
3300 instruct Repl8S(vecX dst, rRegI src) %{
3301   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3302   match(Set dst (ReplicateS src));
3303   format %{ "movd    $dst,$src\n\t"
3304             "pshuflw $dst,$dst,0x00\n\t"
3305             "punpcklqdq $dst,$dst\t! replicate8S" %}
3306   ins_encode %{
3307     __ movdl($dst$$XMMRegister, $src$$Register);
3308     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3309     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3310   %}
3311   ins_pipe( pipe_slow );
3312 %}
3313 
3314 instruct Repl8S_mem(vecX dst, memory mem) %{
3315   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3316   match(Set dst (ReplicateS (LoadS mem)));
3317   format %{ "pshuflw $dst,$mem,0x00\n\t"
3318             "punpcklqdq $dst,$dst\t! replicate8S" %}
3319   ins_encode %{
3320     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3321     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3322   %}
3323   ins_pipe( pipe_slow );
3324 %}
3325 
3326 instruct Repl8S_imm(vecX dst, immI con) %{
3327   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3328   match(Set dst (ReplicateS con));
3329   format %{ "movq    $dst,[$constantaddress]\n\t"
3330             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3331   ins_encode %{
3332     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3333     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3334   %}
3335   ins_pipe( pipe_slow );
3336 %}
3337 
3338 instruct Repl16S(vecY dst, rRegI src) %{
3339   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3340   match(Set dst (ReplicateS src));
3341   format %{ "movd    $dst,$src\n\t"
3342             "pshuflw $dst,$dst,0x00\n\t"
3343             "punpcklqdq $dst,$dst\n\t"
3344             "vinserti128_high $dst,$dst\t! replicate16S" %}
3345   ins_encode %{
3346     __ movdl($dst$$XMMRegister, $src$$Register);
3347     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3348     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3349     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3350   %}
3351   ins_pipe( pipe_slow );
3352 %}
3353 
3354 instruct Repl16S_mem(vecY dst, memory mem) %{
3355   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3356   match(Set dst (ReplicateS (LoadS mem)));
3357   format %{ "pshuflw $dst,$mem,0x00\n\t"
3358             "punpcklqdq $dst,$dst\n\t"
3359             "vinserti128_high $dst,$dst\t! replicate16S" %}
3360   ins_encode %{
3361     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3362     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3363     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3364   %}
3365   ins_pipe( pipe_slow );
3366 %}
3367 
3368 instruct Repl16S_imm(vecY dst, immI con) %{
3369   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3370   match(Set dst (ReplicateS con));
3371   format %{ "movq    $dst,[$constantaddress]\n\t"
3372             "punpcklqdq $dst,$dst\n\t"
3373             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3374   ins_encode %{
3375     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3376     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3377     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3378   %}
3379   ins_pipe( pipe_slow );
3380 %}
3381 
3382 instruct Repl32S(legVecZ dst, rRegI src) %{
3383   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3384   match(Set dst (ReplicateS src));
3385   format %{ "movd    $dst,$src\n\t"
3386             "pshuflw $dst,$dst,0x00\n\t"
3387             "punpcklqdq $dst,$dst\n\t"
3388             "vinserti128_high $dst,$dst\n\t"
3389             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3390   ins_encode %{
3391     __ movdl($dst$$XMMRegister, $src$$Register);
3392     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3393     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3394     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3395     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3396   %}
3397   ins_pipe( pipe_slow );
3398 %}
3399 
3400 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3401   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3402   match(Set dst (ReplicateS (LoadS mem)));
3403   format %{ "pshuflw $dst,$mem,0x00\n\t"
3404             "punpcklqdq $dst,$dst\n\t"
3405             "vinserti128_high $dst,$dst\n\t"
3406             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3407   ins_encode %{
3408     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3409     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3410     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3411     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3412   %}
3413   ins_pipe( pipe_slow );
3414 %}
3415 
3416 instruct Repl32S_imm(legVecZ dst, immI con) %{
3417   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3418   match(Set dst (ReplicateS con));
3419   format %{ "movq    $dst,[$constantaddress]\n\t"
3420             "punpcklqdq $dst,$dst\n\t"
3421             "vinserti128_high $dst,$dst\n\t"
3422             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3423   ins_encode %{
3424     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3425     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3426     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3427     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3428   %}
3429   ins_pipe( pipe_slow );
3430 %}
3431 
3432 instruct Repl4I(vecX dst, rRegI src) %{
3433   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3434   match(Set dst (ReplicateI src));
3435   format %{ "movd    $dst,$src\n\t"
3436             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3437   ins_encode %{
3438     __ movdl($dst$$XMMRegister, $src$$Register);
3439     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3440   %}
3441   ins_pipe( pipe_slow );
3442 %}
3443 
3444 instruct Repl4I_mem(vecX dst, memory mem) %{
3445   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3446   match(Set dst (ReplicateI (LoadI mem)));
3447   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3448   ins_encode %{
3449     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3450   %}
3451   ins_pipe( pipe_slow );
3452 %}
3453 
3454 instruct Repl8I(vecY dst, rRegI src) %{
3455   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3456   match(Set dst (ReplicateI src));
3457   format %{ "movd    $dst,$src\n\t"
3458             "pshufd  $dst,$dst,0x00\n\t"
3459             "vinserti128_high $dst,$dst\t! replicate8I" %}
3460   ins_encode %{
3461     __ movdl($dst$$XMMRegister, $src$$Register);
3462     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3463     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3464   %}
3465   ins_pipe( pipe_slow );
3466 %}
3467 
3468 instruct Repl8I_mem(vecY dst, memory mem) %{
3469   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3470   match(Set dst (ReplicateI (LoadI mem)));
3471   format %{ "pshufd  $dst,$mem,0x00\n\t"
3472             "vinserti128_high $dst,$dst\t! replicate8I" %}
3473   ins_encode %{
3474     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3475     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3476   %}
3477   ins_pipe( pipe_slow );
3478 %}
3479 
3480 instruct Repl16I(legVecZ dst, rRegI src) %{
3481   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3482   match(Set dst (ReplicateI src));
3483   format %{ "movd    $dst,$src\n\t"
3484             "pshufd  $dst,$dst,0x00\n\t"
3485             "vinserti128_high $dst,$dst\n\t"
3486             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3487   ins_encode %{
3488     __ movdl($dst$$XMMRegister, $src$$Register);
3489     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3490     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3491     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3492   %}
3493   ins_pipe( pipe_slow );
3494 %}
3495 
3496 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3497   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3498   match(Set dst (ReplicateI (LoadI mem)));
3499   format %{ "pshufd  $dst,$mem,0x00\n\t"
3500             "vinserti128_high $dst,$dst\n\t"
3501             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3502   ins_encode %{
3503     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3504     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3505     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
3510 instruct Repl4I_imm(vecX dst, immI con) %{
3511   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3512   match(Set dst (ReplicateI con));
3513   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3514             "punpcklqdq $dst,$dst" %}
3515   ins_encode %{
3516     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3517     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
3522 instruct Repl8I_imm(vecY dst, immI con) %{
3523   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524   match(Set dst (ReplicateI con));
3525   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3526             "punpcklqdq $dst,$dst\n\t"
3527             "vinserti128_high $dst,$dst" %}
3528   ins_encode %{
3529     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3530     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3531     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3532   %}
3533   ins_pipe( pipe_slow );
3534 %}
3535 
3536 instruct Repl16I_imm(legVecZ dst, immI con) %{
3537   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3538   match(Set dst (ReplicateI con));
3539   format %{ "movq    $dst,[$constantaddress]\n\t"
3540             "punpcklqdq $dst,$dst\n\t"
3541             "vinserti128_high $dst,$dst\n\t"
3542             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3543   ins_encode %{
3544     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3545     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3546     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3547     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3548   %}
3549   ins_pipe( pipe_slow );
3550 %}
3551 
3552 // A long can be loaded into an XMM register directly from memory.
3553 instruct Repl2L_mem(vecX dst, memory mem) %{
3554   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3555   match(Set dst (ReplicateL (LoadL mem)));
3556   format %{ "movq    $dst,$mem\n\t"
3557             "punpcklqdq $dst,$dst\t! replicate2L" %}
3558   ins_encode %{
3559     __ movq($dst$$XMMRegister, $mem$$Address);
3560     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3561   %}
3562   ins_pipe( pipe_slow );
3563 %}
3564 
3565 // Replicate a long (8-byte) scalar into a vector
3566 #ifdef _LP64
3567 instruct Repl4L(vecY dst, rRegL src) %{
3568   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3569   match(Set dst (ReplicateL src));
3570   format %{ "movdq   $dst,$src\n\t"
3571             "punpcklqdq $dst,$dst\n\t"
3572             "vinserti128_high $dst,$dst\t! replicate4L" %}
3573   ins_encode %{
3574     __ movdq($dst$$XMMRegister, $src$$Register);
3575     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3576     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3577   %}
3578   ins_pipe( pipe_slow );
3579 %}
3580 
3581 instruct Repl8L(legVecZ dst, rRegL src) %{
3582   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3583   match(Set dst (ReplicateL src));
3584   format %{ "movdq   $dst,$src\n\t"
3585             "punpcklqdq $dst,$dst\n\t"
3586             "vinserti128_high $dst,$dst\n\t"
3587             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3588   ins_encode %{
3589     __ movdq($dst$$XMMRegister, $src$$Register);
3590     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3591     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3592     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3593   %}
3594   ins_pipe( pipe_slow );
3595 %}
3596 #else // _LP64
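// On 32-bit, a long lives in a register pair, so the low and high halves are
// moved into the XMM register separately and combined with punpckldq before
// broadcasting; this needs an extra TEMP vector register.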
3597 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3598   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3599   match(Set dst (ReplicateL src));
3600   effect(TEMP dst, USE src, TEMP tmp);
3601   format %{ "movdl   $dst,$src.lo\n\t"
3602             "movdl   $tmp,$src.hi\n\t"
3603             "punpckldq $dst,$tmp\n\t"
3604             "punpcklqdq $dst,$dst\n\t"
3605             "vinserti128_high $dst,$dst\t! replicate4L" %}
3606   ins_encode %{
3607     __ movdl($dst$$XMMRegister, $src$$Register);
3608     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3609     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3610     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3611     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3612   %}
3613   ins_pipe( pipe_slow );
3614 %}
3615 
3616 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3617   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3618   match(Set dst (ReplicateL src));
3619   effect(TEMP dst, USE src, TEMP tmp);
3620   format %{ "movdl   $dst,$src.lo\n\t"
3621             "movdl   $tmp,$src.hi\n\t"
3622             "punpckldq $dst,$tmp\n\t"
3623             "punpcklqdq $dst,$dst\n\t"
3624             "vinserti128_high $dst,$dst\n\t"
3625             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3626   ins_encode %{
3627     __ movdl($dst$$XMMRegister, $src$$Register);
3628     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3629     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3630     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3631     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3632     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3633   %}
3634   ins_pipe( pipe_slow );
3635 %}
3636 #endif // _LP64
3637 
3638 instruct Repl4L_imm(vecY dst, immL con) %{
3639   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3640   match(Set dst (ReplicateL con));
3641   format %{ "movq    $dst,[$constantaddress]\n\t"
3642             "punpcklqdq $dst,$dst\n\t"
3643             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3644   ins_encode %{
3645     __ movq($dst$$XMMRegister, $constantaddress($con));
3646     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3647     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3648   %}
3649   ins_pipe( pipe_slow );
3650 %}
3651 
3652 instruct Repl8L_imm(legVecZ dst, immL con) %{
3653   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3654   match(Set dst (ReplicateL con));
3655   format %{ "movq    $dst,[$constantaddress]\n\t"
3656             "punpcklqdq $dst,$dst\n\t"
3657             "vinserti128_high $dst,$dst\n\t"
3658             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3659   ins_encode %{
3660     __ movq($dst$$XMMRegister, $constantaddress($con));
3661     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3662     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3663     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3664   %}
3665   ins_pipe( pipe_slow );
3666 %}
3667 
3668 instruct Repl4L_mem(vecY dst, memory mem) %{
3669   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3670   match(Set dst (ReplicateL (LoadL mem)));
3671   format %{ "movq    $dst,$mem\n\t"
3672             "punpcklqdq $dst,$dst\n\t"
3673             "vinserti128_high $dst,$dst\t! replicate4L" %}
3674   ins_encode %{
3675     __ movq($dst$$XMMRegister, $mem$$Address);
3676     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3677     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3678   %}
3679   ins_pipe( pipe_slow );
3680 %}
3681 
3682 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3683   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3684   match(Set dst (ReplicateL (LoadL mem)));
3685   format %{ "movq    $dst,$mem\n\t"
3686             "punpcklqdq $dst,$dst\n\t"
3687             "vinserti128_high $dst,$dst\n\t"
3688             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3689   ins_encode %{
3690     __ movq($dst$$XMMRegister, $mem$$Address);
3691     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3692     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3693     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3694   %}
3695   ins_pipe( pipe_slow );
3696 %}
3697 
3698 instruct Repl2F_mem(vecD dst, memory mem) %{
3699   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3700   match(Set dst (ReplicateF (LoadF mem)));
3701   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3702   ins_encode %{
3703     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3704   %}
3705   ins_pipe( pipe_slow );
3706 %}
3707 
3708 instruct Repl4F_mem(vecX dst, memory mem) %{
3709   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3710   match(Set dst (ReplicateF (LoadF mem)));
3711   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3712   ins_encode %{
3713     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3714   %}
3715   ins_pipe( pipe_slow );
3716 %}
3717 
3718 instruct Repl8F(vecY dst, vlRegF src) %{
3719   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3720   match(Set dst (ReplicateF src));
3721   format %{ "pshufd  $dst,$src,0x00\n\t"
3722             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3723   ins_encode %{
3724     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3725     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3726   %}
3727   ins_pipe( pipe_slow );
3728 %}
3729 
3730 instruct Repl8F_mem(vecY dst, memory mem) %{
3731   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3732   match(Set dst (ReplicateF (LoadF mem)));
3733   format %{ "pshufd  $dst,$mem,0x00\n\t"
3734             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3735   ins_encode %{
3736     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3737     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3738   %}
3739   ins_pipe( pipe_slow );
3740 %}
3741 
3742 instruct Repl16F(legVecZ dst, vlRegF src) %{
3743   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3744   match(Set dst (ReplicateF src));
3745   format %{ "pshufd  $dst,$src,0x00\n\t"
3746             "vinsertf128_high $dst,$dst\t"
3747             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3748   ins_encode %{
3749     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3750     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3751     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3752   %}
3753   ins_pipe( pipe_slow );
3754 %}
3755 
3756 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3757   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3758   match(Set dst (ReplicateF (LoadF mem)));
3759   format %{ "pshufd  $dst,$mem,0x00\n\t"
3760             "vinsertf128_high $dst,$dst\t"
3761             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3762   ins_encode %{
3763     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3764     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3765     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3766   %}
3767   ins_pipe( pipe_slow );
3768 %}
3769 
3770 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3771   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3772   match(Set dst (ReplicateF zero));
3773   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3774   ins_encode %{
3775     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3776   %}
3777   ins_pipe( fpu_reg_reg );
3778 %}
3779 
3780 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3781   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3782   match(Set dst (ReplicateF zero));
3783   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3784   ins_encode %{
3785     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3786   %}
3787   ins_pipe( fpu_reg_reg );
3788 %}
3789 
3790 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3791   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3792   match(Set dst (ReplicateF zero));
3793   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3794   ins_encode %{
3795     int vector_len = 1;
3796     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3797   %}
3798   ins_pipe( fpu_reg_reg );
3799 %}
3800 
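// The pshufd selector 0x44 (binary 01_00_01_00) picks dwords {0,1,0,1}, i.e. it
// duplicates the low 64-bit lane; that is the idiom used below to replicate a double.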
3801 instruct Repl2D_mem(vecX dst, memory mem) %{
3802   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3803   match(Set dst (ReplicateD (LoadD mem)));
3804   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3805   ins_encode %{
3806     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3807   %}
3808   ins_pipe( pipe_slow );
3809 %}
3810 
3811 instruct Repl4D(vecY dst, vlRegD src) %{
3812   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3813   match(Set dst (ReplicateD src));
3814   format %{ "pshufd  $dst,$src,0x44\n\t"
3815             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3816   ins_encode %{
3817     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3818     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3819   %}
3820   ins_pipe( pipe_slow );
3821 %}
3822 
3823 instruct Repl4D_mem(vecY dst, memory mem) %{
3824   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3825   match(Set dst (ReplicateD (LoadD mem)));
3826   format %{ "pshufd  $dst,$mem,0x44\n\t"
3827             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3828   ins_encode %{
3829     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3830     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3831   %}
3832   ins_pipe( pipe_slow );
3833 %}
3834 
3835 instruct Repl8D(legVecZ dst, vlRegD src) %{
3836   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3837   match(Set dst (ReplicateD src));
3838   format %{ "pshufd  $dst,$src,0x44\n\t"
3839             "vinsertf128_high $dst,$dst\t"
3840             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3841   ins_encode %{
3842     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3843     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3844     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3845   %}
3846   ins_pipe( pipe_slow );
3847 %}
3848 
3849 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3850   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3851   match(Set dst (ReplicateD (LoadD mem)));
3852   format %{ "pshufd  $dst,$mem,0x44\n\t"
3853             "vinsertf128_high $dst,$dst\t"
3854             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3855   ins_encode %{
3856     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3857     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3858     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3859   %}
3860   ins_pipe( pipe_slow );
3861 %}
3862 
3863 // Replicate double (8 byte) scalar zero to be vector
3864 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3865   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3866   match(Set dst (ReplicateD zero));
3867   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3868   ins_encode %{
3869     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3870   %}
3871   ins_pipe( fpu_reg_reg );
3872 %}
3873 
3874 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3875   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3876   match(Set dst (ReplicateD zero));
3877   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3878   ins_encode %{
3879     int vector_len = 1;
3880     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3881   %}
3882   ins_pipe( fpu_reg_reg );
3883 %}
3884 
3885 // ====================GENERIC REPLICATE==========================================
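//
// The rules in this block are guarded only by the vector length and use the
// plain SSE2 broadcast idioms: movdl moves the scalar from a GPR into the low
// element of an XMM register, punpcklbw/punpckldq/punpcklqdq widen it, and
// pshuflw/pshufd with selector 0x00 copy element 0 into every lane of the low
// 128 bits.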
3886 
3887 // Replicate byte scalar to be vector
3888 instruct Repl4B(vecS dst, rRegI src) %{
3889   predicate(n->as_Vector()->length() == 4);
3890   match(Set dst (ReplicateB src));
3891   format %{ "movd    $dst,$src\n\t"
3892             "punpcklbw $dst,$dst\n\t"
3893             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3894   ins_encode %{
3895     __ movdl($dst$$XMMRegister, $src$$Register);
3896     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3897     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3898   %}
3899   ins_pipe( pipe_slow );
3900 %}
3901 
3902 instruct Repl8B(vecD dst, rRegI src) %{
3903   predicate(n->as_Vector()->length() == 8);
3904   match(Set dst (ReplicateB src));
3905   format %{ "movd    $dst,$src\n\t"
3906             "punpcklbw $dst,$dst\n\t"
3907             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3908   ins_encode %{
3909     __ movdl($dst$$XMMRegister, $src$$Register);
3910     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3911     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3912   %}
3913   ins_pipe( pipe_slow );
3914 %}
3915 
3916 // Replicate byte scalar immediate to be vector by loading from const table.
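// The replicate4_imm()/replicate8_imm() helpers repeat the immediate across a
// 4- or 8-byte constant of the given element width; the result is then
// materialized through the constant table ($constantaddress).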
3917 instruct Repl4B_imm(vecS dst, immI con) %{
3918   predicate(n->as_Vector()->length() == 4);
3919   match(Set dst (ReplicateB con));
3920   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3921   ins_encode %{
3922     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3923   %}
3924   ins_pipe( pipe_slow );
3925 %}
3926 
3927 instruct Repl8B_imm(vecD dst, immI con) %{
3928   predicate(n->as_Vector()->length() == 8);
3929   match(Set dst (ReplicateB con));
3930   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3931   ins_encode %{
3932     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3933   %}
3934   ins_pipe( pipe_slow );
3935 %}
3936 
3937 // Replicate byte scalar zero to be vector
3938 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3939   predicate(n->as_Vector()->length() == 4);
3940   match(Set dst (ReplicateB zero));
3941   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3942   ins_encode %{
3943     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3944   %}
3945   ins_pipe( fpu_reg_reg );
3946 %}
3947 
3948 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3949   predicate(n->as_Vector()->length() == 8);
3950   match(Set dst (ReplicateB zero));
3951   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3952   ins_encode %{
3953     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3954   %}
3955   ins_pipe( fpu_reg_reg );
3956 %}
3957 
3958 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3959   predicate(n->as_Vector()->length() == 16);
3960   match(Set dst (ReplicateB zero));
3961   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3962   ins_encode %{
3963     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3964   %}
3965   ins_pipe( fpu_reg_reg );
3966 %}
3967 
3968 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3969   predicate(n->as_Vector()->length() == 32);
3970   match(Set dst (ReplicateB zero));
3971   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3972   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1, which lacks a 256-bit vpxor (AVX2 adds it).
3974     int vector_len = 1;
3975     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3976   %}
3977   ins_pipe( fpu_reg_reg );
3978 %}
3979 
3980 // Replicate char/short (2 byte) scalar to be vector
3981 instruct Repl2S(vecS dst, rRegI src) %{
3982   predicate(n->as_Vector()->length() == 2);
3983   match(Set dst (ReplicateS src));
3984   format %{ "movd    $dst,$src\n\t"
3985             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3986   ins_encode %{
3987     __ movdl($dst$$XMMRegister, $src$$Register);
3988     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3989   %}
3990   ins_pipe( fpu_reg_reg );
3991 %}
3992 
3993 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3994 instruct Repl2S_imm(vecS dst, immI con) %{
3995   predicate(n->as_Vector()->length() == 2);
3996   match(Set dst (ReplicateS con));
3997   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3998   ins_encode %{
3999     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4000   %}
4001   ins_pipe( fpu_reg_reg );
4002 %}
4003 
4004 instruct Repl4S_imm(vecD dst, immI con) %{
4005   predicate(n->as_Vector()->length() == 4);
4006   match(Set dst (ReplicateS con));
4007   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4008   ins_encode %{
4009     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4010   %}
4011   ins_pipe( fpu_reg_reg );
4012 %}
4013 
4014 // Replicate char/short (2 byte) scalar zero to be vector
4015 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4016   predicate(n->as_Vector()->length() == 2);
4017   match(Set dst (ReplicateS zero));
4018   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4019   ins_encode %{
4020     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4021   %}
4022   ins_pipe( fpu_reg_reg );
4023 %}
4024 
4025 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4026   predicate(n->as_Vector()->length() == 4);
4027   match(Set dst (ReplicateS zero));
4028   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4029   ins_encode %{
4030     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4031   %}
4032   ins_pipe( fpu_reg_reg );
4033 %}
4034 
4035 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4036   predicate(n->as_Vector()->length() == 8);
4037   match(Set dst (ReplicateS zero));
4038   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4039   ins_encode %{
4040     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4041   %}
4042   ins_pipe( fpu_reg_reg );
4043 %}
4044 
4045 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4046   predicate(n->as_Vector()->length() == 16);
4047   match(Set dst (ReplicateS zero));
4048   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4049   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1, which lacks a 256-bit vpxor (AVX2 adds it).
4051     int vector_len = 1;
4052     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4053   %}
4054   ins_pipe( fpu_reg_reg );
4055 %}
4056 
4057 // Replicate integer (4 byte) scalar to be vector
4058 instruct Repl2I(vecD dst, rRegI src) %{
4059   predicate(n->as_Vector()->length() == 2);
4060   match(Set dst (ReplicateI src));
4061   format %{ "movd    $dst,$src\n\t"
4062             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4063   ins_encode %{
4064     __ movdl($dst$$XMMRegister, $src$$Register);
4065     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4066   %}
4067   ins_pipe( fpu_reg_reg );
4068 %}
4069 
// The integer can be loaded into an xmm register directly from memory.
4071 instruct Repl2I_mem(vecD dst, memory mem) %{
4072   predicate(n->as_Vector()->length() == 2);
4073   match(Set dst (ReplicateI (LoadI mem)));
4074   format %{ "movd    $dst,$mem\n\t"
4075             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4076   ins_encode %{
4077     __ movdl($dst$$XMMRegister, $mem$$Address);
4078     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4079   %}
4080   ins_pipe( fpu_reg_reg );
4081 %}
4082 
4083 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4084 instruct Repl2I_imm(vecD dst, immI con) %{
4085   predicate(n->as_Vector()->length() == 2);
4086   match(Set dst (ReplicateI con));
4087   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4088   ins_encode %{
4089     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4090   %}
4091   ins_pipe( fpu_reg_reg );
4092 %}
4093 
4094 // Replicate integer (4 byte) scalar zero to be vector
4095 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4096   predicate(n->as_Vector()->length() == 2);
4097   match(Set dst (ReplicateI zero));
4098   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4099   ins_encode %{
4100     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4101   %}
4102   ins_pipe( fpu_reg_reg );
4103 %}
4104 
4105 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4106   predicate(n->as_Vector()->length() == 4);
4107   match(Set dst (ReplicateI zero));
4108   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4109   ins_encode %{
4110     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4111   %}
4112   ins_pipe( fpu_reg_reg );
4113 %}
4114 
4115 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4116   predicate(n->as_Vector()->length() == 8);
4117   match(Set dst (ReplicateI zero));
4118   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4119   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1, which lacks a 256-bit vpxor (AVX2 adds it).
4121     int vector_len = 1;
4122     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4123   %}
4124   ins_pipe( fpu_reg_reg );
4125 %}
4126 
4127 // Replicate long (8 byte) scalar to be vector
4128 #ifdef _LP64
4129 instruct Repl2L(vecX dst, rRegL src) %{
4130   predicate(n->as_Vector()->length() == 2);
4131   match(Set dst (ReplicateL src));
4132   format %{ "movdq   $dst,$src\n\t"
4133             "punpcklqdq $dst,$dst\t! replicate2L" %}
4134   ins_encode %{
4135     __ movdq($dst$$XMMRegister, $src$$Register);
4136     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4137   %}
4138   ins_pipe( pipe_slow );
4139 %}
4140 #else // _LP64
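// On 32-bit x86 a long is held in a register pair: the low and high halves are
// moved into XMM registers separately (HIGH_FROM_LOW selects the register that
// holds the high 32 bits) and merged with punpckldq before the 64-bit lane is
// replicated.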
4141 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4142   predicate(n->as_Vector()->length() == 2);
4143   match(Set dst (ReplicateL src));
4144   effect(TEMP dst, USE src, TEMP tmp);
4145   format %{ "movdl   $dst,$src.lo\n\t"
4146             "movdl   $tmp,$src.hi\n\t"
4147             "punpckldq $dst,$tmp\n\t"
4148             "punpcklqdq $dst,$dst\t! replicate2L"%}
4149   ins_encode %{
4150     __ movdl($dst$$XMMRegister, $src$$Register);
4151     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4152     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4153     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4154   %}
4155   ins_pipe( pipe_slow );
4156 %}
4157 #endif // _LP64
4158 
4159 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4160 instruct Repl2L_imm(vecX dst, immL con) %{
4161   predicate(n->as_Vector()->length() == 2);
4162   match(Set dst (ReplicateL con));
4163   format %{ "movq    $dst,[$constantaddress]\n\t"
4164             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4165   ins_encode %{
4166     __ movq($dst$$XMMRegister, $constantaddress($con));
4167     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4168   %}
4169   ins_pipe( pipe_slow );
4170 %}
4171 
4172 // Replicate long (8 byte) scalar zero to be vector
4173 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4174   predicate(n->as_Vector()->length() == 2);
4175   match(Set dst (ReplicateL zero));
4176   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4177   ins_encode %{
4178     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4179   %}
4180   ins_pipe( fpu_reg_reg );
4181 %}
4182 
4183 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4184   predicate(n->as_Vector()->length() == 4);
4185   match(Set dst (ReplicateL zero));
4186   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4187   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1, which lacks a 256-bit vpxor (AVX2 adds it).
4189     int vector_len = 1;
4190     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4191   %}
4192   ins_pipe( fpu_reg_reg );
4193 %}
4194 
4195 // Replicate float (4 byte) scalar to be vector
4196 instruct Repl2F(vecD dst, vlRegF src) %{
4197   predicate(n->as_Vector()->length() == 2);
4198   match(Set dst (ReplicateF src));
4199   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4200   ins_encode %{
4201     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4202   %}
4203   ins_pipe( fpu_reg_reg );
4204 %}
4205 
4206 instruct Repl4F(vecX dst, vlRegF src) %{
4207   predicate(n->as_Vector()->length() == 4);
4208   match(Set dst (ReplicateF src));
4209   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4210   ins_encode %{
4211     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4212   %}
4213   ins_pipe( pipe_slow );
4214 %}
4215 
4216 // Replicate double (8 bytes) scalar to be vector
4217 instruct Repl2D(vecX dst, vlRegD src) %{
4218   predicate(n->as_Vector()->length() == 2);
4219   match(Set dst (ReplicateD src));
4220   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4221   ins_encode %{
4222     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4223   %}
4224   ins_pipe( pipe_slow );
4225 %}
4226 
4227 // ====================EVEX REPLICATE=============================================
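//
// vector_len encodes the operand width handed to the assembler:
//   0 = 128-bit (Assembler::AVX_128bit), 1 = 256-bit, 2 = 512-bit.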
4228 
4229 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4230   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4231   match(Set dst (ReplicateB (LoadB mem)));
4232   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4233   ins_encode %{
4234     int vector_len = 0;
4235     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4236   %}
4237   ins_pipe( pipe_slow );
4238 %}
4239 
4240 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4241   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4242   match(Set dst (ReplicateB (LoadB mem)));
4243   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4244   ins_encode %{
4245     int vector_len = 0;
4246     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4247   %}
4248   ins_pipe( pipe_slow );
4249 %}
4250 
4251 instruct Repl16B_evex(vecX dst, rRegI src) %{
4252   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4253   match(Set dst (ReplicateB src));
4254   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4255   ins_encode %{
    int vector_len = 0;
4257     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4258   %}
4259   ins_pipe( pipe_slow );
4260 %}
4261 
4262 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4263   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4264   match(Set dst (ReplicateB (LoadB mem)));
4265   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4266   ins_encode %{
4267     int vector_len = 0;
4268     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4269   %}
4270   ins_pipe( pipe_slow );
4271 %}
4272 
4273 instruct Repl32B_evex(vecY dst, rRegI src) %{
4274   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4275   match(Set dst (ReplicateB src));
4276   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4277   ins_encode %{
    int vector_len = 1;
4279     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4280   %}
4281   ins_pipe( pipe_slow );
4282 %}
4283 
4284 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4285   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4286   match(Set dst (ReplicateB (LoadB mem)));
4287   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4288   ins_encode %{
4289     int vector_len = 1;
4290     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4291   %}
4292   ins_pipe( pipe_slow );
4293 %}
4294 
4295 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4296   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4297   match(Set dst (ReplicateB src));
4298   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4299   ins_encode %{
4300    int vector_len = 2;
4301     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4302   %}
4303   ins_pipe( pipe_slow );
4304 %}
4305 
4306 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4307   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4308   match(Set dst (ReplicateB (LoadB mem)));
4309   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4310   ins_encode %{
4311     int vector_len = 2;
4312     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4313   %}
4314   ins_pipe( pipe_slow );
4315 %}
4316 
4317 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4318   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4319   match(Set dst (ReplicateB con));
4320   format %{ "movq    $dst,[$constantaddress]\n\t"
4321             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4322   ins_encode %{
    int vector_len = 0;
4324     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4325     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4326   %}
4327   ins_pipe( pipe_slow );
4328 %}
4329 
4330 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4331   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4332   match(Set dst (ReplicateB con));
4333   format %{ "movq    $dst,[$constantaddress]\n\t"
4334             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4335   ins_encode %{
    int vector_len = 1;
4337     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4338     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4339   %}
4340   ins_pipe( pipe_slow );
4341 %}
4342 
4343 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4344   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4345   match(Set dst (ReplicateB con));
4346   format %{ "movq    $dst,[$constantaddress]\n\t"
4347             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4348   ins_encode %{
4349    int vector_len = 2;
4350     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4351     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4352   %}
4353   ins_pipe( pipe_slow );
4354 %}
4355 
4356 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4357   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4358   match(Set dst (ReplicateB zero));
4359   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4360   ins_encode %{
    // 512-bit vpxor needs the EVEX encoding, which the UseAVX > 2 predicate guarantees here.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4364   %}
4365   ins_pipe( fpu_reg_reg );
4366 %}
4367 
4368 instruct Repl4S_evex(vecD dst, rRegI src) %{
4369   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4370   match(Set dst (ReplicateS src));
4371   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4372   ins_encode %{
    int vector_len = 0;
4374     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4375   %}
4376   ins_pipe( pipe_slow );
4377 %}
4378 
4379 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4380   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4381   match(Set dst (ReplicateS (LoadS mem)));
4382   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4383   ins_encode %{
4384     int vector_len = 0;
4385     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4386   %}
4387   ins_pipe( pipe_slow );
4388 %}
4389 
4390 instruct Repl8S_evex(vecX dst, rRegI src) %{
4391   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4392   match(Set dst (ReplicateS src));
4393   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4394   ins_encode %{
    int vector_len = 0;
4396     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4397   %}
4398   ins_pipe( pipe_slow );
4399 %}
4400 
4401 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4402   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4403   match(Set dst (ReplicateS (LoadS mem)));
4404   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4405   ins_encode %{
4406     int vector_len = 0;
4407     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4408   %}
4409   ins_pipe( pipe_slow );
4410 %}
4411 
4412 instruct Repl16S_evex(vecY dst, rRegI src) %{
4413   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4414   match(Set dst (ReplicateS src));
4415   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4416   ins_encode %{
    int vector_len = 1;
4418     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4419   %}
4420   ins_pipe( pipe_slow );
4421 %}
4422 
4423 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4424   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4425   match(Set dst (ReplicateS (LoadS mem)));
4426   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4427   ins_encode %{
4428     int vector_len = 1;
4429     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4430   %}
4431   ins_pipe( pipe_slow );
4432 %}
4433 
4434 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4435   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4436   match(Set dst (ReplicateS src));
4437   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4438   ins_encode %{
    int vector_len = 2;
4440     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4441   %}
4442   ins_pipe( pipe_slow );
4443 %}
4444 
4445 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4446   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4447   match(Set dst (ReplicateS (LoadS mem)));
4448   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4449   ins_encode %{
4450     int vector_len = 2;
4451     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4452   %}
4453   ins_pipe( pipe_slow );
4454 %}
4455 
4456 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4457   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4458   match(Set dst (ReplicateS con));
4459   format %{ "movq    $dst,[$constantaddress]\n\t"
4460             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4461   ins_encode %{
    int vector_len = 0;
4463     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4464     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4465   %}
4466   ins_pipe( pipe_slow );
4467 %}
4468 
4469 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4470   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4471   match(Set dst (ReplicateS con));
4472   format %{ "movq    $dst,[$constantaddress]\n\t"
4473             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4474   ins_encode %{
    int vector_len = 1;
4476     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4477     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4478   %}
4479   ins_pipe( pipe_slow );
4480 %}
4481 
4482 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4483   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4484   match(Set dst (ReplicateS con));
4485   format %{ "movq    $dst,[$constantaddress]\n\t"
4486             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4487   ins_encode %{
    int vector_len = 2;
4489     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4490     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4491   %}
4492   ins_pipe( pipe_slow );
4493 %}
4494 
4495 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4496   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4497   match(Set dst (ReplicateS zero));
4498   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4499   ins_encode %{
    // 512-bit vpxor needs the EVEX encoding, which the UseAVX > 2 predicate guarantees here.
4501     int vector_len = 2;
4502     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4503   %}
4504   ins_pipe( fpu_reg_reg );
4505 %}
4506 
4507 instruct Repl4I_evex(vecX dst, rRegI src) %{
4508   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4509   match(Set dst (ReplicateI src));
4510   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4511   ins_encode %{
4512     int vector_len = 0;
4513     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4514   %}
4515   ins_pipe( pipe_slow );
4516 %}
4517 
4518 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4519   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4520   match(Set dst (ReplicateI (LoadI mem)));
4521   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4522   ins_encode %{
4523     int vector_len = 0;
4524     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4525   %}
4526   ins_pipe( pipe_slow );
4527 %}
4528 
4529 instruct Repl8I_evex(vecY dst, rRegI src) %{
4530   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4531   match(Set dst (ReplicateI src));
4532   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4533   ins_encode %{
4534     int vector_len = 1;
4535     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4536   %}
4537   ins_pipe( pipe_slow );
4538 %}
4539 
4540 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4541   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4542   match(Set dst (ReplicateI (LoadI mem)));
4543   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4544   ins_encode %{
4545     int vector_len = 1;
4546     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4547   %}
4548   ins_pipe( pipe_slow );
4549 %}
4550 
4551 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4552   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4553   match(Set dst (ReplicateI src));
4554   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4555   ins_encode %{
4556     int vector_len = 2;
4557     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4558   %}
4559   ins_pipe( pipe_slow );
4560 %}
4561 
4562 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4563   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4564   match(Set dst (ReplicateI (LoadI mem)));
4565   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4566   ins_encode %{
4567     int vector_len = 2;
4568     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4569   %}
4570   ins_pipe( pipe_slow );
4571 %}
4572 
4573 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4574   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4575   match(Set dst (ReplicateI con));
4576   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4577             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4578   ins_encode %{
4579     int vector_len = 0;
4580     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4581     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4582   %}
4583   ins_pipe( pipe_slow );
4584 %}
4585 
4586 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4587   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4588   match(Set dst (ReplicateI con));
4589   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4590             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4591   ins_encode %{
4592     int vector_len = 1;
4593     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4594     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4595   %}
4596   ins_pipe( pipe_slow );
4597 %}
4598 
4599 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4600   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4601   match(Set dst (ReplicateI con));
4602   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4603             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4604   ins_encode %{
4605     int vector_len = 2;
4606     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4607     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4608   %}
4609   ins_pipe( pipe_slow );
4610 %}
4611 
4612 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4613   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4614   match(Set dst (ReplicateI zero));
4615   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4616   ins_encode %{
    // 512-bit vpxor needs the EVEX encoding, which the UseAVX > 2 predicate guarantees here.
4618     int vector_len = 2;
4619     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4620   %}
4621   ins_pipe( fpu_reg_reg );
4622 %}
4623 
4624 // Replicate long (8 byte) scalar to be vector
4625 #ifdef _LP64
4626 instruct Repl4L_evex(vecY dst, rRegL src) %{
4627   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4628   match(Set dst (ReplicateL src));
4629   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4630   ins_encode %{
4631     int vector_len = 1;
4632     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4633   %}
4634   ins_pipe( pipe_slow );
4635 %}
4636 
4637 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4638   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4639   match(Set dst (ReplicateL src));
4640   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4641   ins_encode %{
4642     int vector_len = 2;
4643     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4644   %}
4645   ins_pipe( pipe_slow );
4646 %}
4647 #else // _LP64
4648 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4649   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4650   match(Set dst (ReplicateL src));
4651   effect(TEMP dst, USE src, TEMP tmp);
4652   format %{ "movdl   $dst,$src.lo\n\t"
4653             "movdl   $tmp,$src.hi\n\t"
4654             "punpckldq $dst,$tmp\n\t"
4655             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4656   ins_encode %{
4657     int vector_len = 1;
4658     __ movdl($dst$$XMMRegister, $src$$Register);
4659     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4660     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4661     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4662   %}
4663   ins_pipe( pipe_slow );
4664 %}
4665 
4666 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4667   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4668   match(Set dst (ReplicateL src));
4669   effect(TEMP dst, USE src, TEMP tmp);
4670   format %{ "movdl   $dst,$src.lo\n\t"
4671             "movdl   $tmp,$src.hi\n\t"
4672             "punpckldq $dst,$tmp\n\t"
4673             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4674   ins_encode %{
4675     int vector_len = 2;
4676     __ movdl($dst$$XMMRegister, $src$$Register);
4677     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4678     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4679     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4680   %}
4681   ins_pipe( pipe_slow );
4682 %}
4683 #endif // _LP64
4684 
4685 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4686   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4687   match(Set dst (ReplicateL con));
4688   format %{ "movq    $dst,[$constantaddress]\n\t"
4689             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4690   ins_encode %{
4691     int vector_len = 1;
4692     __ movq($dst$$XMMRegister, $constantaddress($con));
4693     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4694   %}
4695   ins_pipe( pipe_slow );
4696 %}
4697 
4698 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4699   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4700   match(Set dst (ReplicateL con));
4701   format %{ "movq    $dst,[$constantaddress]\n\t"
4702             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4703   ins_encode %{
4704     int vector_len = 2;
4705     __ movq($dst$$XMMRegister, $constantaddress($con));
4706     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4712   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4713   match(Set dst (ReplicateL (LoadL mem)));
4714   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4715   ins_encode %{
4716     int vector_len = 0;
4717     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4718   %}
4719   ins_pipe( pipe_slow );
4720 %}
4721 
4722 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4723   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4724   match(Set dst (ReplicateL (LoadL mem)));
4725   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4726   ins_encode %{
4727     int vector_len = 1;
4728     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4729   %}
4730   ins_pipe( pipe_slow );
4731 %}
4732 
4733 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4734   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4735   match(Set dst (ReplicateL (LoadL mem)));
4736   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4737   ins_encode %{
4738     int vector_len = 2;
4739     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4740   %}
4741   ins_pipe( pipe_slow );
4742 %}
4743 
4744 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4745   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4746   match(Set dst (ReplicateL zero));
4747   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4748   ins_encode %{
    // 512-bit vpxor needs the EVEX encoding, which the UseAVX > 2 predicate guarantees here.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4752   %}
4753   ins_pipe( fpu_reg_reg );
4754 %}
4755 
4756 instruct Repl8F_evex(vecY dst, regF src) %{
4757   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4758   match(Set dst (ReplicateF src));
4759   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4760   ins_encode %{
4761     int vector_len = 1;
4762     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4763   %}
4764   ins_pipe( pipe_slow );
4765 %}
4766 
4767 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4768   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4769   match(Set dst (ReplicateF (LoadF mem)));
4770   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4771   ins_encode %{
4772     int vector_len = 1;
4773     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4774   %}
4775   ins_pipe( pipe_slow );
4776 %}
4777 
4778 instruct Repl16F_evex(vecZ dst, regF src) %{
4779   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4780   match(Set dst (ReplicateF src));
4781   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4782   ins_encode %{
4783     int vector_len = 2;
4784     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4785   %}
4786   ins_pipe( pipe_slow );
4787 %}
4788 
4789 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4790   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4791   match(Set dst (ReplicateF (LoadF mem)));
4792   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4793   ins_encode %{
4794     int vector_len = 2;
4795     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4796   %}
4797   ins_pipe( pipe_slow );
4798 %}
4799 
4800 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4801   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4802   match(Set dst (ReplicateF zero));
4803   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4804   ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4808   %}
4809   ins_pipe( fpu_reg_reg );
4810 %}
4811 
4812 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4813   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4814   match(Set dst (ReplicateF zero));
4815   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4816   ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4820   %}
4821   ins_pipe( fpu_reg_reg );
4822 %}
4823 
4824 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4825   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4826   match(Set dst (ReplicateF zero));
4827   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4828   ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4832   %}
4833   ins_pipe( fpu_reg_reg );
4834 %}
4835 
4836 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4837   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4838   match(Set dst (ReplicateF zero));
4839   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4840   ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4844   %}
4845   ins_pipe( fpu_reg_reg );
4846 %}
4847 
4848 instruct Repl4D_evex(vecY dst, regD src) %{
4849   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4850   match(Set dst (ReplicateD src));
4851   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4852   ins_encode %{
4853     int vector_len = 1;
4854     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4855   %}
4856   ins_pipe( pipe_slow );
4857 %}
4858 
4859 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4860   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4861   match(Set dst (ReplicateD (LoadD mem)));
4862   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4863   ins_encode %{
4864     int vector_len = 1;
4865     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4866   %}
4867   ins_pipe( pipe_slow );
4868 %}
4869 
4870 instruct Repl8D_evex(vecZ dst, regD src) %{
4871   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4872   match(Set dst (ReplicateD src));
4873   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4874   ins_encode %{
4875     int vector_len = 2;
4876     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4877   %}
4878   ins_pipe( pipe_slow );
4879 %}
4880 
4881 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4882   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4883   match(Set dst (ReplicateD (LoadD mem)));
4884   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4885   ins_encode %{
4886     int vector_len = 2;
4887     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4888   %}
4889   ins_pipe( pipe_slow );
4890 %}
4891 
4892 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4893   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4894   match(Set dst (ReplicateD zero));
4895   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4896   ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4900   %}
4901   ins_pipe( fpu_reg_reg );
4902 %}
4903 
4904 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4905   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4906   match(Set dst (ReplicateD zero));
4907   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4908   ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4912   %}
4913   ins_pipe( fpu_reg_reg );
4914 %}
4915 
4916 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4917   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4918   match(Set dst (ReplicateD zero));
4919   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4920   ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4924   %}
4925   ins_pipe( fpu_reg_reg );
4926 %}
4927 
4928 // ====================REDUCTION ARITHMETIC=======================================
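//
// The integer add reductions fold the vector horizontally (phaddd on SSE,
// extract-high plus shuffle-and-add on AVX/EVEX), add the scalar input src1,
// and move the final element back to a general-purpose register.  The pshufd
// selectors used for the folds are 0xE (bring the upper 64 bits of a 128-bit
// value down) and 0x1 (bring element 1 down to element 0).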
4929 
4930 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4931   predicate(UseSSE > 2 && UseAVX == 0);
4932   match(Set dst (AddReductionVI src1 src2));
4933   effect(TEMP tmp2, TEMP tmp);
4934   format %{ "movdqu  $tmp2,$src2\n\t"
4935             "phaddd  $tmp2,$tmp2\n\t"
4936             "movd    $tmp,$src1\n\t"
4937             "paddd   $tmp,$tmp2\n\t"
4938             "movd    $dst,$tmp\t! add reduction2I" %}
4939   ins_encode %{
4940     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4941     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4942     __ movdl($tmp$$XMMRegister, $src1$$Register);
4943     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4944     __ movdl($dst$$Register, $tmp$$XMMRegister);
4945   %}
4946   ins_pipe( pipe_slow );
4947 %}
4948 
4949 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4950   predicate(VM_Version::supports_avxonly());
4951   match(Set dst (AddReductionVI src1 src2));
4952   effect(TEMP tmp, TEMP tmp2);
4953   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4954             "movd     $tmp2,$src1\n\t"
4955             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4956             "movd     $dst,$tmp2\t! add reduction2I" %}
4957   ins_encode %{
4958     int vector_len = 0;
4959     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4960     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4961     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4962     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4963   %}
4964   ins_pipe( pipe_slow );
4965 %}
4966 
4967 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4968   predicate(UseAVX > 2);
4969   match(Set dst (AddReductionVI src1 src2));
4970   effect(TEMP tmp, TEMP tmp2);
4971   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4972             "vpaddd  $tmp,$src2,$tmp2\n\t"
4973             "movd    $tmp2,$src1\n\t"
4974             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4975             "movd    $dst,$tmp2\t! add reduction2I" %}
4976   ins_encode %{
4977     int vector_len = 0;
4978     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4979     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4980     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4981     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4982     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4983   %}
4984   ins_pipe( pipe_slow );
4985 %}
4986 
4987 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4988   predicate(UseSSE > 2 && UseAVX == 0);
4989   match(Set dst (AddReductionVI src1 src2));
4990   effect(TEMP tmp, TEMP tmp2);
4991   format %{ "movdqu  $tmp,$src2\n\t"
4992             "phaddd  $tmp,$tmp\n\t"
4993             "phaddd  $tmp,$tmp\n\t"
4994             "movd    $tmp2,$src1\n\t"
4995             "paddd   $tmp2,$tmp\n\t"
4996             "movd    $dst,$tmp2\t! add reduction4I" %}
4997   ins_encode %{
4998     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4999     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5000     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5001     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5002     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
5003     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5004   %}
5005   ins_pipe( pipe_slow );
5006 %}
5007 
5008 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5009   predicate(VM_Version::supports_avxonly());
5010   match(Set dst (AddReductionVI src1 src2));
5011   effect(TEMP tmp, TEMP tmp2);
5012   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5013             "vphaddd  $tmp,$tmp,$tmp\n\t"
5014             "movd     $tmp2,$src1\n\t"
5015             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5016             "movd     $dst,$tmp2\t! add reduction4I" %}
5017   ins_encode %{
5018     int vector_len = 0;
5019     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5020     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
5021     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5022     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5023     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5024   %}
5025   ins_pipe( pipe_slow );
5026 %}
5027 
5028 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5029   predicate(UseAVX > 2);
5030   match(Set dst (AddReductionVI src1 src2));
5031   effect(TEMP tmp, TEMP tmp2);
5032   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5033             "vpaddd  $tmp,$src2,$tmp2\n\t"
5034             "pshufd  $tmp2,$tmp,0x1\n\t"
5035             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5036             "movd    $tmp2,$src1\n\t"
5037             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5038             "movd    $dst,$tmp2\t! add reduction4I" %}
5039   ins_encode %{
5040     int vector_len = 0;
5041     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5042     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5043     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5044     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5045     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5046     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5047     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5048   %}
5049   ins_pipe( pipe_slow );
5050 %}
5051 
5052 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5053   predicate(VM_Version::supports_avxonly());
5054   match(Set dst (AddReductionVI src1 src2));
5055   effect(TEMP tmp, TEMP tmp2);
5056   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5057             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5058             "vextracti128_high  $tmp2,$tmp\n\t"
5059             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5060             "movd     $tmp2,$src1\n\t"
5061             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5062             "movd     $dst,$tmp2\t! add reduction8I" %}
5063   ins_encode %{
5064     int vector_len = 1;
5065     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5066     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5067     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5068     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5069     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5070     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5071     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5072   %}
5073   ins_pipe( pipe_slow );
5074 %}
5075 
5076 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5077   predicate(UseAVX > 2);
5078   match(Set dst (AddReductionVI src1 src2));
5079   effect(TEMP tmp, TEMP tmp2);
5080   format %{ "vextracti128_high  $tmp,$src2\n\t"
5081             "vpaddd  $tmp,$tmp,$src2\n\t"
5082             "pshufd  $tmp2,$tmp,0xE\n\t"
5083             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5084             "pshufd  $tmp2,$tmp,0x1\n\t"
5085             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5086             "movd    $tmp2,$src1\n\t"
5087             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5088             "movd    $dst,$tmp2\t! add reduction8I" %}
5089   ins_encode %{
5090     int vector_len = 0;
5091     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5092     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5093     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5094     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5095     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5096     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5097     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5098     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5099     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5100   %}
5101   ins_pipe( pipe_slow );
5102 %}
5103 
5104 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5105   predicate(UseAVX > 2);
5106   match(Set dst (AddReductionVI src1 src2));
5107   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5108   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5109             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5110             "vextracti128_high  $tmp,$tmp3\n\t"
5111             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5112             "pshufd  $tmp2,$tmp,0xE\n\t"
5113             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5114             "pshufd  $tmp2,$tmp,0x1\n\t"
5115             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5116             "movd    $tmp2,$src1\n\t"
5117             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5118             "movd    $dst,$tmp2\t! add reduction16I" %}
5119   ins_encode %{
5120     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5121     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5122     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5123     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5124     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5125     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5126     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5127     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5128     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5129     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5130     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5131   %}
5132   ins_pipe( pipe_slow );
5133 %}
5134 
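// Note on the integer add reductions above and the long add reductions below
// (LP64 only): they all fold the lanes of $src2 down to lane 0 by repeatedly
// adding the upper half onto the lower half (vextracti*/pshufd followed by a
// packed add) and then fold in the scalar input $src1.  A rough scalar sketch
// of the intended result (illustration only, not generated code):
//
//   int result = src1;
//   for (int i = 0; i < N; i++) result += src2[i];   // N = number of lanes
//   dst = result;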
5135 #ifdef _LP64
5136 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5137   predicate(UseAVX > 2);
5138   match(Set dst (AddReductionVL src1 src2));
5139   effect(TEMP tmp, TEMP tmp2);
5140   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5141             "vpaddq  $tmp,$src2,$tmp2\n\t"
5142             "movdq   $tmp2,$src1\n\t"
5143             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5144             "movdq   $dst,$tmp2\t! add reduction2L" %}
5145   ins_encode %{
5146     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5147     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5148     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5149     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5150     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5151   %}
5152   ins_pipe( pipe_slow );
5153 %}
5154 
5155 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5156   predicate(UseAVX > 2);
5157   match(Set dst (AddReductionVL src1 src2));
5158   effect(TEMP tmp, TEMP tmp2);
5159   format %{ "vextracti128_high  $tmp,$src2\n\t"
5160             "vpaddq  $tmp2,$tmp,$src2\n\t"
5161             "pshufd  $tmp,$tmp2,0xE\n\t"
5162             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5163             "movdq   $tmp,$src1\n\t"
5164             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5165             "movdq   $dst,$tmp2\t! add reduction4L" %}
5166   ins_encode %{
5167     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5168     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5169     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5170     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5171     __ movdq($tmp$$XMMRegister, $src1$$Register);
5172     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5173     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5174   %}
5175   ins_pipe( pipe_slow );
5176 %}
5177 
5178 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5179   predicate(UseAVX > 2);
5180   match(Set dst (AddReductionVL src1 src2));
5181   effect(TEMP tmp, TEMP tmp2);
5182   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5183             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5184             "vextracti128_high  $tmp,$tmp2\n\t"
5185             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5186             "pshufd  $tmp,$tmp2,0xE\n\t"
5187             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5188             "movdq   $tmp,$src1\n\t"
5189             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5190             "movdq   $dst,$tmp2\t! add reduction8L" %}
5191   ins_encode %{
5192     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5193     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5194     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5195     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5196     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5197     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5198     __ movdq($tmp$$XMMRegister, $src1$$Register);
5199     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5200     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5201   %}
5202   ins_pipe( pipe_slow );
5203 %}
5204 #endif
5205 
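// The floating-point add reductions below deliberately avoid a halving tree:
// each lane of $src2 is shuffled into the low element and added into $dst with
// a scalar addss/addsd (or the AVX vaddss/vaddsd form), so the additions stay
// in lane order.  FP addition is not associative, and keeping the original
// order presumably lets the reduction match the result of the scalar loop it
// replaces.  Rough sketch (illustration only):
//
//   for (int i = 0; i < N; i++) dst += src2[i];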
5206 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5207   predicate(UseSSE >= 1 && UseAVX == 0);
5208   match(Set dst (AddReductionVF dst src2));
5209   effect(TEMP dst, TEMP tmp);
5210   format %{ "addss   $dst,$src2\n\t"
5211             "pshufd  $tmp,$src2,0x01\n\t"
5212             "addss   $dst,$tmp\t! add reduction2F" %}
5213   ins_encode %{
5214     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5215     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5216     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5217   %}
5218   ins_pipe( pipe_slow );
5219 %}
5220 
5221 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5222   predicate(UseAVX > 0);
5223   match(Set dst (AddReductionVF dst src2));
5224   effect(TEMP dst, TEMP tmp);
5225   format %{ "vaddss  $dst,$dst,$src2\n\t"
5226             "pshufd  $tmp,$src2,0x01\n\t"
5227             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5228   ins_encode %{
5229     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5230     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5231     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5232   %}
5233   ins_pipe( pipe_slow );
5234 %}
5235 
5236 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5237   predicate(UseSSE >= 1 && UseAVX == 0);
5238   match(Set dst (AddReductionVF dst src2));
5239   effect(TEMP dst, TEMP tmp);
5240   format %{ "addss   $dst,$src2\n\t"
5241             "pshufd  $tmp,$src2,0x01\n\t"
5242             "addss   $dst,$tmp\n\t"
5243             "pshufd  $tmp,$src2,0x02\n\t"
5244             "addss   $dst,$tmp\n\t"
5245             "pshufd  $tmp,$src2,0x03\n\t"
5246             "addss   $dst,$tmp\t! add reduction4F" %}
5247   ins_encode %{
5248     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5249     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5250     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5251     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5252     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5253     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5254     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5255   %}
5256   ins_pipe( pipe_slow );
5257 %}
5258 
5259 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5260   predicate(UseAVX > 0);
5261   match(Set dst (AddReductionVF dst src2));
5262   effect(TEMP tmp, TEMP dst);
5263   format %{ "vaddss  $dst,$dst,$src2\n\t"
5264             "pshufd  $tmp,$src2,0x01\n\t"
5265             "vaddss  $dst,$dst,$tmp\n\t"
5266             "pshufd  $tmp,$src2,0x02\n\t"
5267             "vaddss  $dst,$dst,$tmp\n\t"
5268             "pshufd  $tmp,$src2,0x03\n\t"
5269             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5270   ins_encode %{
5271     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5272     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5273     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5274     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5275     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5276     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5277     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5278   %}
5279   ins_pipe( pipe_slow );
5280 %}
5281 
5282 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5283   predicate(UseAVX > 0);
5284   match(Set dst (AddReductionVF dst src2));
5285   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5286   format %{ "vaddss  $dst,$dst,$src2\n\t"
5287             "pshufd  $tmp,$src2,0x01\n\t"
5288             "vaddss  $dst,$dst,$tmp\n\t"
5289             "pshufd  $tmp,$src2,0x02\n\t"
5290             "vaddss  $dst,$dst,$tmp\n\t"
5291             "pshufd  $tmp,$src2,0x03\n\t"
5292             "vaddss  $dst,$dst,$tmp\n\t"
5293             "vextractf128_high  $tmp2,$src2\n\t"
5294             "vaddss  $dst,$dst,$tmp2\n\t"
5295             "pshufd  $tmp,$tmp2,0x01\n\t"
5296             "vaddss  $dst,$dst,$tmp\n\t"
5297             "pshufd  $tmp,$tmp2,0x02\n\t"
5298             "vaddss  $dst,$dst,$tmp\n\t"
5299             "pshufd  $tmp,$tmp2,0x03\n\t"
5300             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5301   ins_encode %{
5302     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5303     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5304     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5305     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5306     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5307     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5308     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5309     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5310     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5311     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5312     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5313     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5314     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5315     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5316     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5317   %}
5318   ins_pipe( pipe_slow );
5319 %}
5320 
5321 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5322   predicate(UseAVX > 2);
5323   match(Set dst (AddReductionVF dst src2));
5324   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5325   format %{ "vaddss  $dst,$dst,$src2\n\t"
5326             "pshufd  $tmp,$src2,0x01\n\t"
5327             "vaddss  $dst,$dst,$tmp\n\t"
5328             "pshufd  $tmp,$src2,0x02\n\t"
5329             "vaddss  $dst,$dst,$tmp\n\t"
5330             "pshufd  $tmp,$src2,0x03\n\t"
5331             "vaddss  $dst,$dst,$tmp\n\t"
5332             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5333             "vaddss  $dst,$dst,$tmp2\n\t"
5334             "pshufd  $tmp,$tmp2,0x01\n\t"
5335             "vaddss  $dst,$dst,$tmp\n\t"
5336             "pshufd  $tmp,$tmp2,0x02\n\t"
5337             "vaddss  $dst,$dst,$tmp\n\t"
5338             "pshufd  $tmp,$tmp2,0x03\n\t"
5339             "vaddss  $dst,$dst,$tmp\n\t"
5340             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5341             "vaddss  $dst,$dst,$tmp2\n\t"
5342             "pshufd  $tmp,$tmp2,0x01\n\t"
5343             "vaddss  $dst,$dst,$tmp\n\t"
5344             "pshufd  $tmp,$tmp2,0x02\n\t"
5345             "vaddss  $dst,$dst,$tmp\n\t"
5346             "pshufd  $tmp,$tmp2,0x03\n\t"
5347             "vaddss  $dst,$dst,$tmp\n\t"
5348             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5349             "vaddss  $dst,$dst,$tmp2\n\t"
5350             "pshufd  $tmp,$tmp2,0x01\n\t"
5351             "vaddss  $dst,$dst,$tmp\n\t"
5352             "pshufd  $tmp,$tmp2,0x02\n\t"
5353             "vaddss  $dst,$dst,$tmp\n\t"
5354             "pshufd  $tmp,$tmp2,0x03\n\t"
5355             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5356   ins_encode %{
5357     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5358     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5359     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5360     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5361     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5362     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5363     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5364     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5365     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5366     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5367     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5368     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5369     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5370     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5371     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5372     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5373     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5374     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5375     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5376     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5377     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5378     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5379     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5380     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5381     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5382     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5383     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5384     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5385     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5386     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5387     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5388   %}
5389   ins_pipe( pipe_slow );
5390 %}
5391 
5392 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5393   predicate(UseSSE >= 1 && UseAVX == 0);
5394   match(Set dst (AddReductionVD dst src2));
5395   effect(TEMP tmp, TEMP dst);
5396   format %{ "addsd   $dst,$src2\n\t"
5397             "pshufd  $tmp,$src2,0xE\n\t"
5398             "addsd   $dst,$tmp\t! add reduction2D" %}
5399   ins_encode %{
5400     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5401     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5402     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5403   %}
5404   ins_pipe( pipe_slow );
5405 %}
5406 
5407 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5408   predicate(UseAVX > 0);
5409   match(Set dst (AddReductionVD dst src2));
5410   effect(TEMP tmp, TEMP dst);
5411   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5412             "pshufd  $tmp,$src2,0xE\n\t"
5413             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5414   ins_encode %{
5415     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5416     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5417     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5418   %}
5419   ins_pipe( pipe_slow );
5420 %}
5421 
5422 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5423   predicate(UseAVX > 0);
5424   match(Set dst (AddReductionVD dst src2));
5425   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5426   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5427             "pshufd  $tmp,$src2,0xE\n\t"
5428             "vaddsd  $dst,$dst,$tmp\n\t"
5429             "vextractf128  $tmp2,$src2,0x1\n\t"
5430             "vaddsd  $dst,$dst,$tmp2\n\t"
5431             "pshufd  $tmp,$tmp2,0xE\n\t"
5432             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5433   ins_encode %{
5434     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5435     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5436     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5437     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5438     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5439     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5440     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5441   %}
5442   ins_pipe( pipe_slow );
5443 %}
5444 
5445 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5446   predicate(UseAVX > 2);
5447   match(Set dst (AddReductionVD dst src2));
5448   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5449   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5450             "pshufd  $tmp,$src2,0xE\n\t"
5451             "vaddsd  $dst,$dst,$tmp\n\t"
5452             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5453             "vaddsd  $dst,$dst,$tmp2\n\t"
5454             "pshufd  $tmp,$tmp2,0xE\n\t"
5455             "vaddsd  $dst,$dst,$tmp\n\t"
5456             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5457             "vaddsd  $dst,$dst,$tmp2\n\t"
5458             "pshufd  $tmp,$tmp2,0xE\n\t"
5459             "vaddsd  $dst,$dst,$tmp\n\t"
5460             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5461             "vaddsd  $dst,$dst,$tmp2\n\t"
5462             "pshufd  $tmp,$tmp2,0xE\n\t"
5463             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5464   ins_encode %{
5465     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5466     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5467     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5468     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5469     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5470     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5471     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5472     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5473     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5474     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5475     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5476     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5477     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5478     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5479     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5480   %}
5481   ins_pipe( pipe_slow );
5482 %}
5483 
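// The multiply reductions below mirror the structure of the add reductions,
// substituting packed multiplies for the packed adds: pmulld/vpmulld for ints
// (hence the UseSSE > 3, i.e. SSE4.1+, predicate on the SSE forms) and
// vpmullq for longs (hence the AVX-512DQ predicate).  Rough sketch
// (illustration only):
//
//   int result = src1;
//   for (int i = 0; i < N; i++) result *= src2[i];
//   dst = result;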
5484 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5485   predicate(UseSSE > 3 && UseAVX == 0);
5486   match(Set dst (MulReductionVI src1 src2));
5487   effect(TEMP tmp, TEMP tmp2);
5488   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5489             "pmulld  $tmp2,$src2\n\t"
5490             "movd    $tmp,$src1\n\t"
5491             "pmulld  $tmp2,$tmp\n\t"
5492             "movd    $dst,$tmp2\t! mul reduction2I" %}
5493   ins_encode %{
5494     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5495     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5496     __ movdl($tmp$$XMMRegister, $src1$$Register);
5497     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5498     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5499   %}
5500   ins_pipe( pipe_slow );
5501 %}
5502 
5503 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5504   predicate(UseAVX > 0);
5505   match(Set dst (MulReductionVI src1 src2));
5506   effect(TEMP tmp, TEMP tmp2);
5507   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5508             "vpmulld  $tmp,$src2,$tmp2\n\t"
5509             "movd     $tmp2,$src1\n\t"
5510             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5511             "movd     $dst,$tmp2\t! mul reduction2I" %}
5512   ins_encode %{
5513     int vector_len = 0;
5514     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5515     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5516     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5517     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5518     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5519   %}
5520   ins_pipe( pipe_slow );
5521 %}
5522 
5523 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5524   predicate(UseSSE > 3 && UseAVX == 0);
5525   match(Set dst (MulReductionVI src1 src2));
5526   effect(TEMP tmp, TEMP tmp2);
5527   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5528             "pmulld  $tmp2,$src2\n\t"
5529             "pshufd  $tmp,$tmp2,0x1\n\t"
5530             "pmulld  $tmp2,$tmp\n\t"
5531             "movd    $tmp,$src1\n\t"
5532             "pmulld  $tmp2,$tmp\n\t"
5533             "movd    $dst,$tmp2\t! mul reduction4I" %}
5534   ins_encode %{
5535     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5536     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5537     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5538     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5539     __ movdl($tmp$$XMMRegister, $src1$$Register);
5540     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5541     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5542   %}
5543   ins_pipe( pipe_slow );
5544 %}
5545 
5546 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5547   predicate(UseAVX > 0);
5548   match(Set dst (MulReductionVI src1 src2));
5549   effect(TEMP tmp, TEMP tmp2);
5550   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5551             "vpmulld  $tmp,$src2,$tmp2\n\t"
5552             "pshufd   $tmp2,$tmp,0x1\n\t"
5553             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5554             "movd     $tmp2,$src1\n\t"
5555             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5556             "movd     $dst,$tmp2\t! mul reduction4I" %}
5557   ins_encode %{
5558     int vector_len = 0;
5559     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5560     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5561     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5562     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5563     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5564     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5565     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5566   %}
5567   ins_pipe( pipe_slow );
5568 %}
5569 
5570 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5571   predicate(UseAVX > 1);
5572   match(Set dst (MulReductionVI src1 src2));
5573   effect(TEMP tmp, TEMP tmp2);
5574   format %{ "vextracti128_high  $tmp,$src2\n\t"
5575             "vpmulld  $tmp,$tmp,$src2\n\t"
5576             "pshufd   $tmp2,$tmp,0xE\n\t"
5577             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5578             "pshufd   $tmp2,$tmp,0x1\n\t"
5579             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5580             "movd     $tmp2,$src1\n\t"
5581             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5582             "movd     $dst,$tmp2\t! mul reduction8I" %}
5583   ins_encode %{
5584     int vector_len = 0;
5585     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5586     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5587     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5588     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5589     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5590     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5591     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5592     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5593     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5594   %}
5595   ins_pipe( pipe_slow );
5596 %}
5597 
5598 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5599   predicate(UseAVX > 2);
5600   match(Set dst (MulReductionVI src1 src2));
5601   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5602   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5603             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5604             "vextracti128_high  $tmp,$tmp3\n\t"
5605             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5606             "pshufd   $tmp2,$tmp,0xE\n\t"
5607             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5608             "pshufd   $tmp2,$tmp,0x1\n\t"
5609             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5610             "movd     $tmp2,$src1\n\t"
5611             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5612             "movd     $dst,$tmp2\t! mul reduction16I" %}
5613   ins_encode %{
5614     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5615     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5616     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5617     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5618     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5619     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5620     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5621     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5622     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5623     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5624     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5625   %}
5626   ins_pipe( pipe_slow );
5627 %}
5628 
5629 #ifdef _LP64
5630 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5631   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5632   match(Set dst (MulReductionVL src1 src2));
5633   effect(TEMP tmp, TEMP tmp2);
5634   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5635             "vpmullq  $tmp,$src2,$tmp2\n\t"
5636             "movdq    $tmp2,$src1\n\t"
5637             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5638             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5639   ins_encode %{
5640     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5641     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5642     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5643     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5644     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5645   %}
5646   ins_pipe( pipe_slow );
5647 %}
5648 
5649 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5650   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5651   match(Set dst (MulReductionVL src1 src2));
5652   effect(TEMP tmp, TEMP tmp2);
5653   format %{ "vextracti128_high  $tmp,$src2\n\t"
5654             "vpmullq  $tmp2,$tmp,$src2\n\t"
5655             "pshufd   $tmp,$tmp2,0xE\n\t"
5656             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5657             "movdq    $tmp,$src1\n\t"
5658             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5659             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5660   ins_encode %{
5661     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5662     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5663     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5664     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5665     __ movdq($tmp$$XMMRegister, $src1$$Register);
5666     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5667     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5668   %}
5669   ins_pipe( pipe_slow );
5670 %}
5671 
5672 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5673   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5674   match(Set dst (MulReductionVL src1 src2));
5675   effect(TEMP tmp, TEMP tmp2);
5676   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5677             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5678             "vextracti128_high  $tmp,$tmp2\n\t"
5679             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5680             "pshufd   $tmp,$tmp2,0xE\n\t"
5681             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5682             "movdq    $tmp,$src1\n\t"
5683             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5684             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5685   ins_encode %{
5686     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5687     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5688     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5689     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5690     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5691     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5692     __ movdq($tmp$$XMMRegister, $src1$$Register);
5693     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5694     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5695   %}
5696   ins_pipe( pipe_slow );
5697 %}
5698 #endif
5699 
5700 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5701   predicate(UseSSE >= 1 && UseAVX == 0);
5702   match(Set dst (MulReductionVF dst src2));
5703   effect(TEMP dst, TEMP tmp);
5704   format %{ "mulss   $dst,$src2\n\t"
5705             "pshufd  $tmp,$src2,0x01\n\t"
5706             "mulss   $dst,$tmp\t! mul reduction2F" %}
5707   ins_encode %{
5708     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5709     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5710     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5711   %}
5712   ins_pipe( pipe_slow );
5713 %}
5714 
5715 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5716   predicate(UseAVX > 0);
5717   match(Set dst (MulReductionVF dst src2));
5718   effect(TEMP tmp, TEMP dst);
5719   format %{ "vmulss  $dst,$dst,$src2\n\t"
5720             "pshufd  $tmp,$src2,0x01\n\t"
5721             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5722   ins_encode %{
5723     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5724     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5725     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5726   %}
5727   ins_pipe( pipe_slow );
5728 %}
5729 
5730 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5731   predicate(UseSSE >= 1 && UseAVX == 0);
5732   match(Set dst (MulReductionVF dst src2));
5733   effect(TEMP dst, TEMP tmp);
5734   format %{ "mulss   $dst,$src2\n\t"
5735             "pshufd  $tmp,$src2,0x01\n\t"
5736             "mulss   $dst,$tmp\n\t"
5737             "pshufd  $tmp,$src2,0x02\n\t"
5738             "mulss   $dst,$tmp\n\t"
5739             "pshufd  $tmp,$src2,0x03\n\t"
5740             "mulss   $dst,$tmp\t! mul reduction4F" %}
5741   ins_encode %{
5742     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5743     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5744     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5745     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5746     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5747     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5748     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5749   %}
5750   ins_pipe( pipe_slow );
5751 %}
5752 
5753 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5754   predicate(UseAVX > 0);
5755   match(Set dst (MulReductionVF dst src2));
5756   effect(TEMP tmp, TEMP dst);
5757   format %{ "vmulss  $dst,$dst,$src2\n\t"
5758             "pshufd  $tmp,$src2,0x01\n\t"
5759             "vmulss  $dst,$dst,$tmp\n\t"
5760             "pshufd  $tmp,$src2,0x02\n\t"
5761             "vmulss  $dst,$dst,$tmp\n\t"
5762             "pshufd  $tmp,$src2,0x03\n\t"
5763             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5764   ins_encode %{
5765     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5766     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5767     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5768     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5769     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5770     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5771     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5772   %}
5773   ins_pipe( pipe_slow );
5774 %}
5775 
5776 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5777   predicate(UseAVX > 0);
5778   match(Set dst (MulReductionVF dst src2));
5779   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5780   format %{ "vmulss  $dst,$dst,$src2\n\t"
5781             "pshufd  $tmp,$src2,0x01\n\t"
5782             "vmulss  $dst,$dst,$tmp\n\t"
5783             "pshufd  $tmp,$src2,0x02\n\t"
5784             "vmulss  $dst,$dst,$tmp\n\t"
5785             "pshufd  $tmp,$src2,0x03\n\t"
5786             "vmulss  $dst,$dst,$tmp\n\t"
5787             "vextractf128_high  $tmp2,$src2\n\t"
5788             "vmulss  $dst,$dst,$tmp2\n\t"
5789             "pshufd  $tmp,$tmp2,0x01\n\t"
5790             "vmulss  $dst,$dst,$tmp\n\t"
5791             "pshufd  $tmp,$tmp2,0x02\n\t"
5792             "vmulss  $dst,$dst,$tmp\n\t"
5793             "pshufd  $tmp,$tmp2,0x03\n\t"
5794             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5795   ins_encode %{
5796     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5797     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5798     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5799     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5800     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5801     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5802     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5803     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5804     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5805     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5806     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5807     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5808     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5809     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5810     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5811   %}
5812   ins_pipe( pipe_slow );
5813 %}
5814 
5815 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5816   predicate(UseAVX > 2);
5817   match(Set dst (MulReductionVF dst src2));
5818   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5819   format %{ "vmulss  $dst,$dst,$src2\n\t"
5820             "pshufd  $tmp,$src2,0x01\n\t"
5821             "vmulss  $dst,$dst,$tmp\n\t"
5822             "pshufd  $tmp,$src2,0x02\n\t"
5823             "vmulss  $dst,$dst,$tmp\n\t"
5824             "pshufd  $tmp,$src2,0x03\n\t"
5825             "vmulss  $dst,$dst,$tmp\n\t"
5826             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5827             "vmulss  $dst,$dst,$tmp2\n\t"
5828             "pshufd  $tmp,$tmp2,0x01\n\t"
5829             "vmulss  $dst,$dst,$tmp\n\t"
5830             "pshufd  $tmp,$tmp2,0x02\n\t"
5831             "vmulss  $dst,$dst,$tmp\n\t"
5832             "pshufd  $tmp,$tmp2,0x03\n\t"
5833             "vmulss  $dst,$dst,$tmp\n\t"
5834             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5835             "vmulss  $dst,$dst,$tmp2\n\t"
5836             "pshufd  $tmp,$tmp2,0x01\n\t"
5837             "vmulss  $dst,$dst,$tmp\n\t"
5838             "pshufd  $tmp,$tmp2,0x02\n\t"
5839             "vmulss  $dst,$dst,$tmp\n\t"
5840             "pshufd  $tmp,$tmp2,0x03\n\t"
5841             "vmulss  $dst,$dst,$tmp\n\t"
5842             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5843             "vmulss  $dst,$dst,$tmp2\n\t"
5844             "pshufd  $tmp,$tmp2,0x01\n\t"
5845             "vmulss  $dst,$dst,$tmp\n\t"
5846             "pshufd  $tmp,$tmp2,0x02\n\t"
5847             "vmulss  $dst,$dst,$tmp\n\t"
5848             "pshufd  $tmp,$tmp2,0x03\n\t"
5849             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5850   ins_encode %{
5851     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5852     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5853     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5854     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5855     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5856     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5857     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5858     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5859     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5860     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5861     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5862     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5863     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5864     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5865     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5866     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5867     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5868     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5869     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5870     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5871     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5872     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5873     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5874     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5875     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5876     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5877     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5878     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5879     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5880     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5881     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5882   %}
5883   ins_pipe( pipe_slow );
5884 %}
5885 
5886 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5887   predicate(UseSSE >= 1 && UseAVX == 0);
5888   match(Set dst (MulReductionVD dst src2));
5889   effect(TEMP dst, TEMP tmp);
5890   format %{ "mulsd   $dst,$src2\n\t"
5891             "pshufd  $tmp,$src2,0xE\n\t"
5892             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5893   ins_encode %{
5894     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5895     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5896     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5897   %}
5898   ins_pipe( pipe_slow );
5899 %}
5900 
5901 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5902   predicate(UseAVX > 0);
5903   match(Set dst (MulReductionVD dst src2));
5904   effect(TEMP tmp, TEMP dst);
5905   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5906             "pshufd  $tmp,$src2,0xE\n\t"
5907             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5908   ins_encode %{
5909     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5910     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5911     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5912   %}
5913   ins_pipe( pipe_slow );
5914 %}
5915 
5916 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
5917   predicate(UseAVX > 0);
5918   match(Set dst (MulReductionVD dst src2));
5919   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5920   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5921             "pshufd  $tmp,$src2,0xE\n\t"
5922             "vmulsd  $dst,$dst,$tmp\n\t"
5923             "vextractf128_high  $tmp2,$src2\n\t"
5924             "vmulsd  $dst,$dst,$tmp2\n\t"
5925             "pshufd  $tmp,$tmp2,0xE\n\t"
5926             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5927   ins_encode %{
5928     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5929     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5930     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5931     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5932     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5933     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5934     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5935   %}
5936   ins_pipe( pipe_slow );
5937 %}
5938 
5939 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5940   predicate(UseAVX > 2);
5941   match(Set dst (MulReductionVD dst src2));
5942   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5943   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5944             "pshufd  $tmp,$src2,0xE\n\t"
5945             "vmulsd  $dst,$dst,$tmp\n\t"
5946             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5947             "vmulsd  $dst,$dst,$tmp2\n\t"
5948             "pshufd  $tmp,$tmp2,0xE\n\t"
5949             "vmulsd  $dst,$dst,$tmp\n\t"
5950             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5951             "vmulsd  $dst,$dst,$tmp2\n\t"
5952             "pshufd  $tmp,$tmp2,0xE\n\t"
5953             "vmulsd  $dst,$dst,$tmp\n\t"
5954             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5955             "vmulsd  $dst,$dst,$tmp2\n\t"
5956             "pshufd  $tmp,$tmp2,0xE\n\t"
5957             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5958   ins_encode %{
5959     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5960     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5961     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5962     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5963     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5964     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5965     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5966     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5967     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5968     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5969     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5970     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5971     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5972     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5973     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5974   %}
5975   ins_pipe( pipe_slow );
5976 %}
5977 
5978 // ====================VECTOR ARITHMETIC=======================================
5979 
5980 // --------------------------------- ADD --------------------------------------
5981 
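// Each element type and vector width below comes in up to three flavors: an
// SSE form that adds in place (dst = dst + src), an AVX register form
// (dst = src1 + src2), and an AVX form with a memory operand
// (dst = src1 + [mem]).  The vector_len argument picks the encoded width:
// 0 = 128-bit (vecS/vecD/vecX operands), 1 = 256-bit (vecY), 2 = 512-bit
// (vecZ).  Element-wise the effect is simply (illustration only):
//
//   for (int i = 0; i < N; i++) dst[i] = src1[i] + src2[i];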
5982 // Bytes vector add
5983 instruct vadd4B(vecS dst, vecS src) %{
5984   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5985   match(Set dst (AddVB dst src));
5986   format %{ "paddb   $dst,$src\t! add packed4B" %}
5987   ins_encode %{
5988     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5989   %}
5990   ins_pipe( pipe_slow );
5991 %}
5992 
5993 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5994   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5995   match(Set dst (AddVB src1 src2));
5996   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5997   ins_encode %{
5998     int vector_len = 0;
5999     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6000   %}
6001   ins_pipe( pipe_slow );
6002 %}
6003 
6004 
6005 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
6006   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6007   match(Set dst (AddVB src (LoadVector mem)));
6008   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6009   ins_encode %{
6010     int vector_len = 0;
6011     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6012   %}
6013   ins_pipe( pipe_slow );
6014 %}
6015 
6016 instruct vadd8B(vecD dst, vecD src) %{
6017   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6018   match(Set dst (AddVB dst src));
6019   format %{ "paddb   $dst,$src\t! add packed8B" %}
6020   ins_encode %{
6021     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6022   %}
6023   ins_pipe( pipe_slow );
6024 %}
6025 
6026 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6027   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6028   match(Set dst (AddVB src1 src2));
6029   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6030   ins_encode %{
6031     int vector_len = 0;
6032     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6033   %}
6034   ins_pipe( pipe_slow );
6035 %}
6036 
6037 
6038 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6039   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6040   match(Set dst (AddVB src (LoadVector mem)));
6041   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6042   ins_encode %{
6043     int vector_len = 0;
6044     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6045   %}
6046   ins_pipe( pipe_slow );
6047 %}
6048 
6049 instruct vadd16B(vecX dst, vecX src) %{
6050   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6051   match(Set dst (AddVB dst src));
6052   format %{ "paddb   $dst,$src\t! add packed16B" %}
6053   ins_encode %{
6054     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6055   %}
6056   ins_pipe( pipe_slow );
6057 %}
6058 
6059 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6060   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6061   match(Set dst (AddVB src1 src2));
6062   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6063   ins_encode %{
6064     int vector_len = 0;
6065     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6066   %}
6067   ins_pipe( pipe_slow );
6068 %}
6069 
6070 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6071   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6072   match(Set dst (AddVB src (LoadVector mem)));
6073   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6074   ins_encode %{
6075     int vector_len = 0;
6076     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6077   %}
6078   ins_pipe( pipe_slow );
6079 %}
6080 
6081 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6082   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6083   match(Set dst (AddVB src1 src2));
6084   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6085   ins_encode %{
6086     int vector_len = 1;
6087     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6088   %}
6089   ins_pipe( pipe_slow );
6090 %}
6091 
6092 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6093   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6094   match(Set dst (AddVB src (LoadVector mem)));
6095   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6096   ins_encode %{
6097     int vector_len = 1;
6098     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6099   %}
6100   ins_pipe( pipe_slow );
6101 %}
6102 
6103 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6104   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6105   match(Set dst (AddVB src1 src2));
6106   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6107   ins_encode %{
6108     int vector_len = 2;
6109     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6110   %}
6111   ins_pipe( pipe_slow );
6112 %}
6113 
6114 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6115   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6116   match(Set dst (AddVB src (LoadVector mem)));
6117   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6118   ins_encode %{
6119     int vector_len = 2;
6120     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6121   %}
6122   ins_pipe( pipe_slow );
6123 %}
6124 
6125 // Shorts/Chars vector add
6126 instruct vadd2S(vecS dst, vecS src) %{
6127   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6128   match(Set dst (AddVS dst src));
6129   format %{ "paddw   $dst,$src\t! add packed2S" %}
6130   ins_encode %{
6131     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6132   %}
6133   ins_pipe( pipe_slow );
6134 %}
6135 
6136 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6137   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6138   match(Set dst (AddVS src1 src2));
6139   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6140   ins_encode %{
6141     int vector_len = 0;
6142     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6143   %}
6144   ins_pipe( pipe_slow );
6145 %}
6146 
6147 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6148   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6149   match(Set dst (AddVS src (LoadVector mem)));
6150   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6151   ins_encode %{
6152     int vector_len = 0;
6153     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6154   %}
6155   ins_pipe( pipe_slow );
6156 %}
6157 
6158 instruct vadd4S(vecD dst, vecD src) %{
6159   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6160   match(Set dst (AddVS dst src));
6161   format %{ "paddw   $dst,$src\t! add packed4S" %}
6162   ins_encode %{
6163     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6164   %}
6165   ins_pipe( pipe_slow );
6166 %}
6167 
6168 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6169   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6170   match(Set dst (AddVS src1 src2));
6171   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6172   ins_encode %{
6173     int vector_len = 0;
6174     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6175   %}
6176   ins_pipe( pipe_slow );
6177 %}
6178 
6179 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6180   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6181   match(Set dst (AddVS src (LoadVector mem)));
6182   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6183   ins_encode %{
6184     int vector_len = 0;
6185     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6186   %}
6187   ins_pipe( pipe_slow );
6188 %}
6189 
6190 instruct vadd8S(vecX dst, vecX src) %{
6191   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6192   match(Set dst (AddVS dst src));
6193   format %{ "paddw   $dst,$src\t! add packed8S" %}
6194   ins_encode %{
6195     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6196   %}
6197   ins_pipe( pipe_slow );
6198 %}
6199 
6200 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6201   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6202   match(Set dst (AddVS src1 src2));
6203   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6204   ins_encode %{
6205     int vector_len = 0;
6206     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6207   %}
6208   ins_pipe( pipe_slow );
6209 %}
6210 
6211 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6212   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6213   match(Set dst (AddVS src (LoadVector mem)));
6214   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6215   ins_encode %{
6216     int vector_len = 0;
6217     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6218   %}
6219   ins_pipe( pipe_slow );
6220 %}
6221 
6222 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6223   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6224   match(Set dst (AddVS src1 src2));
6225   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6226   ins_encode %{
6227     int vector_len = 1;
6228     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6229   %}
6230   ins_pipe( pipe_slow );
6231 %}
6232 
6233 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6234   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6235   match(Set dst (AddVS src (LoadVector mem)));
6236   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6237   ins_encode %{
6238     int vector_len = 1;
6239     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6240   %}
6241   ins_pipe( pipe_slow );
6242 %}
6243 
6244 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6245   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6246   match(Set dst (AddVS src1 src2));
6247   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6248   ins_encode %{
6249     int vector_len = 2;
6250     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6251   %}
6252   ins_pipe( pipe_slow );
6253 %}
6254 
6255 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6256   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6257   match(Set dst (AddVS src (LoadVector mem)));
6258   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6259   ins_encode %{
6260     int vector_len = 2;
6261     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6262   %}
6263   ins_pipe( pipe_slow );
6264 %}
6265 
6266 // Integers vector add
6267 instruct vadd2I(vecD dst, vecD src) %{
6268   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6269   match(Set dst (AddVI dst src));
6270   format %{ "paddd   $dst,$src\t! add packed2I" %}
6271   ins_encode %{
6272     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6273   %}
6274   ins_pipe( pipe_slow );
6275 %}
6276 
6277 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6278   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6279   match(Set dst (AddVI src1 src2));
6280   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6281   ins_encode %{
6282     int vector_len = 0;
6283     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6284   %}
6285   ins_pipe( pipe_slow );
6286 %}
6287 
6288 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6289   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6290   match(Set dst (AddVI src (LoadVector mem)));
6291   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6292   ins_encode %{
6293     int vector_len = 0;
6294     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6295   %}
6296   ins_pipe( pipe_slow );
6297 %}
6298 
6299 instruct vadd4I(vecX dst, vecX src) %{
6300   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6301   match(Set dst (AddVI dst src));
6302   format %{ "paddd   $dst,$src\t! add packed4I" %}
6303   ins_encode %{
6304     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6305   %}
6306   ins_pipe( pipe_slow );
6307 %}
6308 
6309 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6310   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6311   match(Set dst (AddVI src1 src2));
6312   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6313   ins_encode %{
6314     int vector_len = 0;
6315     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6316   %}
6317   ins_pipe( pipe_slow );
6318 %}
6319 
6320 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6321   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6322   match(Set dst (AddVI src (LoadVector mem)));
6323   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6324   ins_encode %{
6325     int vector_len = 0;
6326     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6327   %}
6328   ins_pipe( pipe_slow );
6329 %}
6330 
6331 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6332   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6333   match(Set dst (AddVI src1 src2));
6334   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6335   ins_encode %{
6336     int vector_len = 1;
6337     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6338   %}
6339   ins_pipe( pipe_slow );
6340 %}
6341 
6342 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6343   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6344   match(Set dst (AddVI src (LoadVector mem)));
6345   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6346   ins_encode %{
6347     int vector_len = 1;
6348     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6349   %}
6350   ins_pipe( pipe_slow );
6351 %}
6352 
6353 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6354   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6355   match(Set dst (AddVI src1 src2));
6356   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6357   ins_encode %{
6358     int vector_len = 2;
6359     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6360   %}
6361   ins_pipe( pipe_slow );
6362 %}
6363 
6364 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6365   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6366   match(Set dst (AddVI src (LoadVector mem)));
6367   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6368   ins_encode %{
6369     int vector_len = 2;
6370     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6371   %}
6372   ins_pipe( pipe_slow );
6373 %}
6374 
6375 // Longs vector add
6376 instruct vadd2L(vecX dst, vecX src) %{
6377   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6378   match(Set dst (AddVL dst src));
6379   format %{ "paddq   $dst,$src\t! add packed2L" %}
6380   ins_encode %{
6381     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6382   %}
6383   ins_pipe( pipe_slow );
6384 %}
6385 
6386 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6387   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6388   match(Set dst (AddVL src1 src2));
6389   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6390   ins_encode %{
6391     int vector_len = 0;
6392     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6393   %}
6394   ins_pipe( pipe_slow );
6395 %}
6396 
6397 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6398   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6399   match(Set dst (AddVL src (LoadVector mem)));
6400   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6401   ins_encode %{
6402     int vector_len = 0;
6403     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6404   %}
6405   ins_pipe( pipe_slow );
6406 %}
6407 
6408 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6409   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6410   match(Set dst (AddVL src1 src2));
6411   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6412   ins_encode %{
6413     int vector_len = 1;
6414     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6415   %}
6416   ins_pipe( pipe_slow );
6417 %}
6418 
6419 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6420   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6421   match(Set dst (AddVL src (LoadVector mem)));
6422   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6423   ins_encode %{
6424     int vector_len = 1;
6425     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6426   %}
6427   ins_pipe( pipe_slow );
6428 %}
6429 
6430 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6431   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6432   match(Set dst (AddVL src1 src2));
6433   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6434   ins_encode %{
6435     int vector_len = 2;
6436     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6437   %}
6438   ins_pipe( pipe_slow );
6439 %}
6440 
6441 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6442   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6443   match(Set dst (AddVL src (LoadVector mem)));
6444   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6445   ins_encode %{
6446     int vector_len = 2;
6447     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6448   %}
6449   ins_pipe( pipe_slow );
6450 %}
6451 
6452 // Floats vector add
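// 256-bit floating-point adds only require AVX (UseAVX > 0), whereas the 256-bit
// integer rules above need AVX2 (UseAVX > 1), reflecting that AVX2 introduced the
// 256-bit packed integer instructions.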
6453 instruct vadd2F(vecD dst, vecD src) %{
6454   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6455   match(Set dst (AddVF dst src));
6456   format %{ "addps   $dst,$src\t! add packed2F" %}
6457   ins_encode %{
6458     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6459   %}
6460   ins_pipe( pipe_slow );
6461 %}
6462 
6463 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6464   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6465   match(Set dst (AddVF src1 src2));
6466   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6467   ins_encode %{
6468     int vector_len = 0;
6469     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6470   %}
6471   ins_pipe( pipe_slow );
6472 %}
6473 
6474 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6475   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6476   match(Set dst (AddVF src (LoadVector mem)));
6477   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6478   ins_encode %{
6479     int vector_len = 0;
6480     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6481   %}
6482   ins_pipe( pipe_slow );
6483 %}
6484 
6485 instruct vadd4F(vecX dst, vecX src) %{
6486   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6487   match(Set dst (AddVF dst src));
6488   format %{ "addps   $dst,$src\t! add packed4F" %}
6489   ins_encode %{
6490     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6491   %}
6492   ins_pipe( pipe_slow );
6493 %}
6494 
6495 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6496   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6497   match(Set dst (AddVF src1 src2));
6498   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6499   ins_encode %{
6500     int vector_len = 0;
6501     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6502   %}
6503   ins_pipe( pipe_slow );
6504 %}
6505 
6506 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6507   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6508   match(Set dst (AddVF src (LoadVector mem)));
6509   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6510   ins_encode %{
6511     int vector_len = 0;
6512     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6513   %}
6514   ins_pipe( pipe_slow );
6515 %}
6516 
6517 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6518   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6519   match(Set dst (AddVF src1 src2));
6520   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6521   ins_encode %{
6522     int vector_len = 1;
6523     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6524   %}
6525   ins_pipe( pipe_slow );
6526 %}
6527 
6528 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6529   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6530   match(Set dst (AddVF src (LoadVector mem)));
6531   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6532   ins_encode %{
6533     int vector_len = 1;
6534     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6535   %}
6536   ins_pipe( pipe_slow );
6537 %}
6538 
6539 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6540   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6541   match(Set dst (AddVF src1 src2));
6542   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6543   ins_encode %{
6544     int vector_len = 2;
6545     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6546   %}
6547   ins_pipe( pipe_slow );
6548 %}
6549 
6550 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6551   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6552   match(Set dst (AddVF src (LoadVector mem)));
6553   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6554   ins_encode %{
6555     int vector_len = 2;
6556     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6557   %}
6558   ins_pipe( pipe_slow );
6559 %}
6560 
6561 // Doubles vector add
6562 instruct vadd2D(vecX dst, vecX src) %{
6563   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6564   match(Set dst (AddVD dst src));
6565   format %{ "addpd   $dst,$src\t! add packed2D" %}
6566   ins_encode %{
6567     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6568   %}
6569   ins_pipe( pipe_slow );
6570 %}
6571 
6572 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6573   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6574   match(Set dst (AddVD src1 src2));
6575   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6576   ins_encode %{
6577     int vector_len = 0;
6578     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6579   %}
6580   ins_pipe( pipe_slow );
6581 %}
6582 
6583 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6584   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6585   match(Set dst (AddVD src (LoadVector mem)));
6586   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6587   ins_encode %{
6588     int vector_len = 0;
6589     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6590   %}
6591   ins_pipe( pipe_slow );
6592 %}
6593 
6594 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6595   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6596   match(Set dst (AddVD src1 src2));
6597   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6598   ins_encode %{
6599     int vector_len = 1;
6600     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6601   %}
6602   ins_pipe( pipe_slow );
6603 %}
6604 
6605 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6606   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6607   match(Set dst (AddVD src (LoadVector mem)));
6608   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6609   ins_encode %{
6610     int vector_len = 1;
6611     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6612   %}
6613   ins_pipe( pipe_slow );
6614 %}
6615 
6616 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6617   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6618   match(Set dst (AddVD src1 src2));
6619   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6620   ins_encode %{
6621     int vector_len = 2;
6622     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6623   %}
6624   ins_pipe( pipe_slow );
6625 %}
6626 
6627 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6628   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6629   match(Set dst (AddVD src (LoadVector mem)));
6630   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6631   ins_encode %{
6632     int vector_len = 2;
6633     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6634   %}
6635   ins_pipe( pipe_slow );
6636 %}
6637 
6638 // --------------------------------- SUB --------------------------------------
6639 
6640 // Bytes vector sub
6641 instruct vsub4B(vecS dst, vecS src) %{
6642   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6643   match(Set dst (SubVB dst src));
6644   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6645   ins_encode %{
6646     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6647   %}
6648   ins_pipe( pipe_slow );
6649 %}
6650 
6651 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6652   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6653   match(Set dst (SubVB src1 src2));
6654   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6655   ins_encode %{
6656     int vector_len = 0;
6657     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6658   %}
6659   ins_pipe( pipe_slow );
6660 %}
6661 
6662 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6663   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6664   match(Set dst (SubVB src (LoadVector mem)));
6665   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6666   ins_encode %{
6667     int vector_len = 0;
6668     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6669   %}
6670   ins_pipe( pipe_slow );
6671 %}
6672 
6673 instruct vsub8B(vecD dst, vecD src) %{
6674   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6675   match(Set dst (SubVB dst src));
6676   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6677   ins_encode %{
6678     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6679   %}
6680   ins_pipe( pipe_slow );
6681 %}
6682 
6683 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6684   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6685   match(Set dst (SubVB src1 src2));
6686   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6687   ins_encode %{
6688     int vector_len = 0;
6689     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6690   %}
6691   ins_pipe( pipe_slow );
6692 %}
6693 
6694 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6695   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6696   match(Set dst (SubVB src (LoadVector mem)));
6697   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6698   ins_encode %{
6699     int vector_len = 0;
6700     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6701   %}
6702   ins_pipe( pipe_slow );
6703 %}
6704 
6705 instruct vsub16B(vecX dst, vecX src) %{
6706   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6707   match(Set dst (SubVB dst src));
6708   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6709   ins_encode %{
6710     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6711   %}
6712   ins_pipe( pipe_slow );
6713 %}
6714 
6715 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6716   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6717   match(Set dst (SubVB src1 src2));
6718   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6719   ins_encode %{
6720     int vector_len = 0;
6721     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6722   %}
6723   ins_pipe( pipe_slow );
6724 %}
6725 
6726 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6727   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6728   match(Set dst (SubVB src (LoadVector mem)));
6729   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6730   ins_encode %{
6731     int vector_len = 0;
6732     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6733   %}
6734   ins_pipe( pipe_slow );
6735 %}
6736 
6737 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6738   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6739   match(Set dst (SubVB src1 src2));
6740   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6741   ins_encode %{
6742     int vector_len = 1;
6743     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6744   %}
6745   ins_pipe( pipe_slow );
6746 %}
6747 
6748 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6749   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6750   match(Set dst (SubVB src (LoadVector mem)));
6751   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6752   ins_encode %{
6753     int vector_len = 1;
6754     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6755   %}
6756   ins_pipe( pipe_slow );
6757 %}
6758 
6759 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6760   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6761   match(Set dst (SubVB src1 src2));
6762   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6763   ins_encode %{
6764     int vector_len = 2;
6765     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6766   %}
6767   ins_pipe( pipe_slow );
6768 %}
6769 
6770 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6771   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6772   match(Set dst (SubVB src (LoadVector mem)));
6773   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6774   ins_encode %{
6775     int vector_len = 2;
6776     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6777   %}
6778   ins_pipe( pipe_slow );
6779 %}
6780 
6781 // Shorts/Chars vector sub
6782 instruct vsub2S(vecS dst, vecS src) %{
6783   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6784   match(Set dst (SubVS dst src));
6785   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6786   ins_encode %{
6787     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6788   %}
6789   ins_pipe( pipe_slow );
6790 %}
6791 
6792 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6793   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6794   match(Set dst (SubVS src1 src2));
6795   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6796   ins_encode %{
6797     int vector_len = 0;
6798     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6799   %}
6800   ins_pipe( pipe_slow );
6801 %}
6802 
6803 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6804   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6805   match(Set dst (SubVS src (LoadVector mem)));
6806   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6807   ins_encode %{
6808     int vector_len = 0;
6809     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6810   %}
6811   ins_pipe( pipe_slow );
6812 %}
6813 
6814 instruct vsub4S(vecD dst, vecD src) %{
6815   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6816   match(Set dst (SubVS dst src));
6817   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6818   ins_encode %{
6819     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6820   %}
6821   ins_pipe( pipe_slow );
6822 %}
6823 
6824 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6825   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6826   match(Set dst (SubVS src1 src2));
6827   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6828   ins_encode %{
6829     int vector_len = 0;
6830     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6831   %}
6832   ins_pipe( pipe_slow );
6833 %}
6834 
6835 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6836   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6837   match(Set dst (SubVS src (LoadVector mem)));
6838   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6839   ins_encode %{
6840     int vector_len = 0;
6841     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6842   %}
6843   ins_pipe( pipe_slow );
6844 %}
6845 
6846 instruct vsub8S(vecX dst, vecX src) %{
6847   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6848   match(Set dst (SubVS dst src));
6849   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6850   ins_encode %{
6851     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6852   %}
6853   ins_pipe( pipe_slow );
6854 %}
6855 
6856 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6857   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6858   match(Set dst (SubVS src1 src2));
6859   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6860   ins_encode %{
6861     int vector_len = 0;
6862     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6863   %}
6864   ins_pipe( pipe_slow );
6865 %}
6866 
6867 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6868   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6869   match(Set dst (SubVS src (LoadVector mem)));
6870   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6871   ins_encode %{
6872     int vector_len = 0;
6873     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6874   %}
6875   ins_pipe( pipe_slow );
6876 %}
6877 
6878 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6879   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6880   match(Set dst (SubVS src1 src2));
6881   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6882   ins_encode %{
6883     int vector_len = 1;
6884     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6885   %}
6886   ins_pipe( pipe_slow );
6887 %}
6888 
6889 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6890   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6891   match(Set dst (SubVS src (LoadVector mem)));
6892   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6893   ins_encode %{
6894     int vector_len = 1;
6895     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6896   %}
6897   ins_pipe( pipe_slow );
6898 %}
6899 
6900 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6901   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6902   match(Set dst (SubVS src1 src2));
6903   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6904   ins_encode %{
6905     int vector_len = 2;
6906     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6907   %}
6908   ins_pipe( pipe_slow );
6909 %}
6910 
6911 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6912   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6913   match(Set dst (SubVS src (LoadVector mem)));
6914   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6915   ins_encode %{
6916     int vector_len = 2;
6917     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6918   %}
6919   ins_pipe( pipe_slow );
6920 %}
6921 
6922 // Integers vector sub
6923 instruct vsub2I(vecD dst, vecD src) %{
6924   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6925   match(Set dst (SubVI dst src));
6926   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6927   ins_encode %{
6928     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6929   %}
6930   ins_pipe( pipe_slow );
6931 %}
6932 
6933 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
6934   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6935   match(Set dst (SubVI src1 src2));
6936   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
6937   ins_encode %{
6938     int vector_len = 0;
6939     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6940   %}
6941   ins_pipe( pipe_slow );
6942 %}
6943 
6944 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
6945   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6946   match(Set dst (SubVI src (LoadVector mem)));
6947   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
6948   ins_encode %{
6949     int vector_len = 0;
6950     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6951   %}
6952   ins_pipe( pipe_slow );
6953 %}
6954 
6955 instruct vsub4I(vecX dst, vecX src) %{
6956   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6957   match(Set dst (SubVI dst src));
6958   format %{ "psubd   $dst,$src\t! sub packed4I" %}
6959   ins_encode %{
6960     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6961   %}
6962   ins_pipe( pipe_slow );
6963 %}
6964 
6965 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
6966   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6967   match(Set dst (SubVI src1 src2));
6968   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
6969   ins_encode %{
6970     int vector_len = 0;
6971     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6972   %}
6973   ins_pipe( pipe_slow );
6974 %}
6975 
6976 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
6977   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6978   match(Set dst (SubVI src (LoadVector mem)));
6979   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
6980   ins_encode %{
6981     int vector_len = 0;
6982     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6983   %}
6984   ins_pipe( pipe_slow );
6985 %}
6986 
6987 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
6988   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6989   match(Set dst (SubVI src1 src2));
6990   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
6991   ins_encode %{
6992     int vector_len = 1;
6993     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6994   %}
6995   ins_pipe( pipe_slow );
6996 %}
6997 
6998 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
6999   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7000   match(Set dst (SubVI src (LoadVector mem)));
7001   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7002   ins_encode %{
7003     int vector_len = 1;
7004     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7005   %}
7006   ins_pipe( pipe_slow );
7007 %}
7008 
7009 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7010   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7011   match(Set dst (SubVI src1 src2));
7012   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7013   ins_encode %{
7014     int vector_len = 2;
7015     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7016   %}
7017   ins_pipe( pipe_slow );
7018 %}
7019 
7020 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7021   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7022   match(Set dst (SubVI src (LoadVector mem)));
7023   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7024   ins_encode %{
7025     int vector_len = 2;
7026     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7027   %}
7028   ins_pipe( pipe_slow );
7029 %}
7030 
7031 // Longs vector sub
7032 instruct vsub2L(vecX dst, vecX src) %{
7033   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7034   match(Set dst (SubVL dst src));
7035   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7036   ins_encode %{
7037     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7038   %}
7039   ins_pipe( pipe_slow );
7040 %}
7041 
7042 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7043   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7044   match(Set dst (SubVL src1 src2));
7045   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7046   ins_encode %{
7047     int vector_len = 0;
7048     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7049   %}
7050   ins_pipe( pipe_slow );
7051 %}
7052 
7053 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7054   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7055   match(Set dst (SubVL src (LoadVector mem)));
7056   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7057   ins_encode %{
7058     int vector_len = 0;
7059     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7060   %}
7061   ins_pipe( pipe_slow );
7062 %}
7063 
7064 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7065   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7066   match(Set dst (SubVL src1 src2));
7067   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7068   ins_encode %{
7069     int vector_len = 1;
7070     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7071   %}
7072   ins_pipe( pipe_slow );
7073 %}
7074 
7075 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7076   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7077   match(Set dst (SubVL src (LoadVector mem)));
7078   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7079   ins_encode %{
7080     int vector_len = 1;
7081     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7082   %}
7083   ins_pipe( pipe_slow );
7084 %}
7085 
7086 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7087   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7088   match(Set dst (SubVL src1 src2));
7089   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7090   ins_encode %{
7091     int vector_len = 2;
7092     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7093   %}
7094   ins_pipe( pipe_slow );
7095 %}
7096 
7097 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7098   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7099   match(Set dst (SubVL src (LoadVector mem)));
7100   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7101   ins_encode %{
7102     int vector_len = 2;
7103     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7104   %}
7105   ins_pipe( pipe_slow );
7106 %}
7107 
7108 // Floats vector sub
7109 instruct vsub2F(vecD dst, vecD src) %{
7110   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7111   match(Set dst (SubVF dst src));
7112   format %{ "subps   $dst,$src\t! sub packed2F" %}
7113   ins_encode %{
7114     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7115   %}
7116   ins_pipe( pipe_slow );
7117 %}
7118 
7119 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7120   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7121   match(Set dst (SubVF src1 src2));
7122   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7123   ins_encode %{
7124     int vector_len = 0;
7125     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7126   %}
7127   ins_pipe( pipe_slow );
7128 %}
7129 
7130 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7131   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7132   match(Set dst (SubVF src (LoadVector mem)));
7133   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7134   ins_encode %{
7135     int vector_len = 0;
7136     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7137   %}
7138   ins_pipe( pipe_slow );
7139 %}
7140 
7141 instruct vsub4F(vecX dst, vecX src) %{
7142   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7143   match(Set dst (SubVF dst src));
7144   format %{ "subps   $dst,$src\t! sub packed4F" %}
7145   ins_encode %{
7146     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7147   %}
7148   ins_pipe( pipe_slow );
7149 %}
7150 
7151 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7152   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7153   match(Set dst (SubVF src1 src2));
7154   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7155   ins_encode %{
7156     int vector_len = 0;
7157     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7158   %}
7159   ins_pipe( pipe_slow );
7160 %}
7161 
7162 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7163   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7164   match(Set dst (SubVF src (LoadVector mem)));
7165   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7166   ins_encode %{
7167     int vector_len = 0;
7168     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7169   %}
7170   ins_pipe( pipe_slow );
7171 %}
7172 
7173 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7174   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7175   match(Set dst (SubVF src1 src2));
7176   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7177   ins_encode %{
7178     int vector_len = 1;
7179     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7180   %}
7181   ins_pipe( pipe_slow );
7182 %}
7183 
7184 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7185   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7186   match(Set dst (SubVF src (LoadVector mem)));
7187   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7188   ins_encode %{
7189     int vector_len = 1;
7190     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7191   %}
7192   ins_pipe( pipe_slow );
7193 %}
7194 
7195 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7196   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7197   match(Set dst (SubVF src1 src2));
7198   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7199   ins_encode %{
7200     int vector_len = 2;
7201     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7202   %}
7203   ins_pipe( pipe_slow );
7204 %}
7205 
7206 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7207   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7208   match(Set dst (SubVF src (LoadVector mem)));
7209   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7210   ins_encode %{
7211     int vector_len = 2;
7212     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7213   %}
7214   ins_pipe( pipe_slow );
7215 %}
7216 
7217 // Doubles vector sub
7218 instruct vsub2D(vecX dst, vecX src) %{
7219   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7220   match(Set dst (SubVD dst src));
7221   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7222   ins_encode %{
7223     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7224   %}
7225   ins_pipe( pipe_slow );
7226 %}
7227 
7228 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7229   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7230   match(Set dst (SubVD src1 src2));
7231   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7232   ins_encode %{
7233     int vector_len = 0;
7234     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7235   %}
7236   ins_pipe( pipe_slow );
7237 %}
7238 
7239 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7240   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7241   match(Set dst (SubVD src (LoadVector mem)));
7242   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7243   ins_encode %{
7244     int vector_len = 0;
7245     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7246   %}
7247   ins_pipe( pipe_slow );
7248 %}
7249 
7250 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7251   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7252   match(Set dst (SubVD src1 src2));
7253   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7254   ins_encode %{
7255     int vector_len = 1;
7256     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7257   %}
7258   ins_pipe( pipe_slow );
7259 %}
7260 
7261 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7262   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7263   match(Set dst (SubVD src (LoadVector mem)));
7264   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7265   ins_encode %{
7266     int vector_len = 1;
7267     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7268   %}
7269   ins_pipe( pipe_slow );
7270 %}
7271 
7272 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7273   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7274   match(Set dst (SubVD src1 src2));
7275   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7276   ins_encode %{
7277     int vector_len = 2;
7278     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7279   %}
7280   ins_pipe( pipe_slow );
7281 %}
7282 
7283 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7284   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7285   match(Set dst (SubVD src (LoadVector mem)));
7286   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7287   ins_encode %{
7288     int vector_len = 2;
7289     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7290   %}
7291   ins_pipe( pipe_slow );
7292 %}
7293 
7294 // --------------------------------- MUL --------------------------------------
7295 
7296 // Shorts/Chars vector mul
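// pmullw/vpmullw retain the low 16 bits of each 16x16-bit product, giving the
// truncating semantics required for packed short/char multiplication.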
7297 instruct vmul2S(vecS dst, vecS src) %{
7298   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7299   match(Set dst (MulVS dst src));
7300   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
7301   ins_encode %{
7302     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7303   %}
7304   ins_pipe( pipe_slow );
7305 %}
7306 
7307 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7308   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7309   match(Set dst (MulVS src1 src2));
7310   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7311   ins_encode %{
7312     int vector_len = 0;
7313     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7314   %}
7315   ins_pipe( pipe_slow );
7316 %}
7317 
7318 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7319   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7320   match(Set dst (MulVS src (LoadVector mem)));
7321   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7322   ins_encode %{
7323     int vector_len = 0;
7324     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7325   %}
7326   ins_pipe( pipe_slow );
7327 %}
7328 
7329 instruct vmul4S(vecD dst, vecD src) %{
7330   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7331   match(Set dst (MulVS dst src));
7332   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7333   ins_encode %{
7334     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7335   %}
7336   ins_pipe( pipe_slow );
7337 %}
7338 
7339 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7340   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7341   match(Set dst (MulVS src1 src2));
7342   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7343   ins_encode %{
7344     int vector_len = 0;
7345     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7346   %}
7347   ins_pipe( pipe_slow );
7348 %}
7349 
7350 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7351   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7352   match(Set dst (MulVS src (LoadVector mem)));
7353   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7354   ins_encode %{
7355     int vector_len = 0;
7356     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7357   %}
7358   ins_pipe( pipe_slow );
7359 %}
7360 
7361 instruct vmul8S(vecX dst, vecX src) %{
7362   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7363   match(Set dst (MulVS dst src));
7364   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7365   ins_encode %{
7366     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7367   %}
7368   ins_pipe( pipe_slow );
7369 %}
7370 
7371 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7372   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7373   match(Set dst (MulVS src1 src2));
7374   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7375   ins_encode %{
7376     int vector_len = 0;
7377     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7378   %}
7379   ins_pipe( pipe_slow );
7380 %}
7381 
7382 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7383   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7384   match(Set dst (MulVS src (LoadVector mem)));
7385   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7386   ins_encode %{
7387     int vector_len = 0;
7388     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7389   %}
7390   ins_pipe( pipe_slow );
7391 %}
7392 
7393 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7394   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7395   match(Set dst (MulVS src1 src2));
7396   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7397   ins_encode %{
7398     int vector_len = 1;
7399     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7400   %}
7401   ins_pipe( pipe_slow );
7402 %}
7403 
7404 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7405   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7406   match(Set dst (MulVS src (LoadVector mem)));
7407   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7408   ins_encode %{
7409     int vector_len = 1;
7410     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7411   %}
7412   ins_pipe( pipe_slow );
7413 %}
7414 
7415 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7416   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7417   match(Set dst (MulVS src1 src2));
7418   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7419   ins_encode %{
7420     int vector_len = 2;
7421     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7422   %}
7423   ins_pipe( pipe_slow );
7424 %}
7425 
7426 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7427   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7428   match(Set dst (MulVS src (LoadVector mem)));
7429   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7430   ins_encode %{
7431     int vector_len = 2;
7432     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7433   %}
7434   ins_pipe( pipe_slow );
7435 %}
7436 
7437 // Integers vector mul (sse4_1)
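// pmulld (packed 32x32 -> low 32-bit multiply) was introduced with SSE4.1, hence the
// UseSSE > 3 predicate on the non-AVX rules.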
7438 instruct vmul2I(vecD dst, vecD src) %{
7439   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7440   match(Set dst (MulVI dst src));
7441   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7442   ins_encode %{
7443     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7444   %}
7445   ins_pipe( pipe_slow );
7446 %}
7447 
7448 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7449   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7450   match(Set dst (MulVI src1 src2));
7451   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7452   ins_encode %{
7453     int vector_len = 0;
7454     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7455   %}
7456   ins_pipe( pipe_slow );
7457 %}
7458 
7459 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7460   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7461   match(Set dst (MulVI src (LoadVector mem)));
7462   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7463   ins_encode %{
7464     int vector_len = 0;
7465     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7466   %}
7467   ins_pipe( pipe_slow );
7468 %}
7469 
7470 instruct vmul4I(vecX dst, vecX src) %{
7471   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7472   match(Set dst (MulVI dst src));
7473   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7474   ins_encode %{
7475     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7476   %}
7477   ins_pipe( pipe_slow );
7478 %}
7479 
7480 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7481   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7482   match(Set dst (MulVI src1 src2));
7483   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7484   ins_encode %{
7485     int vector_len = 0;
7486     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7487   %}
7488   ins_pipe( pipe_slow );
7489 %}
7490 
7491 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7492   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7493   match(Set dst (MulVI src (LoadVector mem)));
7494   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7495   ins_encode %{
7496     int vector_len = 0;
7497     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7498   %}
7499   ins_pipe( pipe_slow );
7500 %}
7501 
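// There is no packed 64-bit multiply before AVX-512: vpmullq is an AVX-512DQ
// instruction, so the MulVL rules below are also guarded by supports_avx512dq().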
7502 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7503   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7504   match(Set dst (MulVL src1 src2));
7505   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7506   ins_encode %{
7507     int vector_len = 0;
7508     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7509   %}
7510   ins_pipe( pipe_slow );
7511 %}
7512 
7513 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7514   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7515   match(Set dst (MulVL src (LoadVector mem)));
7516   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7517   ins_encode %{
7518     int vector_len = 0;
7519     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7520   %}
7521   ins_pipe( pipe_slow );
7522 %}
7523 
7524 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7525   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7526   match(Set dst (MulVL src1 src2));
7527   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7528   ins_encode %{
7529     int vector_len = 1;
7530     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7531   %}
7532   ins_pipe( pipe_slow );
7533 %}
7534 
7535 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7536   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7537   match(Set dst (MulVL src (LoadVector mem)));
7538   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7539   ins_encode %{
7540     int vector_len = 1;
7541     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7542   %}
7543   ins_pipe( pipe_slow );
7544 %}
7545 
7546 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7547   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7548   match(Set dst (MulVL src1 src2));
7549   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7550   ins_encode %{
7551     int vector_len = 2;
7552     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7553   %}
7554   ins_pipe( pipe_slow );
7555 %}
7556 
7557 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7558   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7559   match(Set dst (MulVL src (LoadVector mem)));
7560   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7561   ins_encode %{
7562     int vector_len = 2;
7563     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7564   %}
7565   ins_pipe( pipe_slow );
7566 %}
7567 
7568 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7569   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7570   match(Set dst (MulVI src1 src2));
7571   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7572   ins_encode %{
7573     int vector_len = 1;
7574     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7575   %}
7576   ins_pipe( pipe_slow );
7577 %}
7578 
7579 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7580   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7581   match(Set dst (MulVI src (LoadVector mem)));
7582   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7583   ins_encode %{
7584     int vector_len = 1;
7585     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7586   %}
7587   ins_pipe( pipe_slow );
7588 %}
7589 
7590 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7591   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7592   match(Set dst (MulVI src1 src2));
7593   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7594   ins_encode %{
7595     int vector_len = 2;
7596     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7597   %}
7598   ins_pipe( pipe_slow );
7599 %}
7600 
7601 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7602   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7603   match(Set dst (MulVI src (LoadVector mem)));
7604   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7605   ins_encode %{
7606     int vector_len = 2;
7607     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7608   %}
7609   ins_pipe( pipe_slow );
7610 %}
7611 
7612 // Floats vector mul
7613 instruct vmul2F(vecD dst, vecD src) %{
7614   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7615   match(Set dst (MulVF dst src));
7616   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7617   ins_encode %{
7618     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7619   %}
7620   ins_pipe( pipe_slow );
7621 %}
7622 
7623 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7624   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7625   match(Set dst (MulVF src1 src2));
7626   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7627   ins_encode %{
7628     int vector_len = 0;
7629     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7630   %}
7631   ins_pipe( pipe_slow );
7632 %}
7633 
7634 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7635   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7636   match(Set dst (MulVF src (LoadVector mem)));
7637   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7638   ins_encode %{
7639     int vector_len = 0;
7640     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7641   %}
7642   ins_pipe( pipe_slow );
7643 %}
7644 
7645 instruct vmul4F(vecX dst, vecX src) %{
7646   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7647   match(Set dst (MulVF dst src));
7648   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7649   ins_encode %{
7650     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7651   %}
7652   ins_pipe( pipe_slow );
7653 %}
7654 
7655 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7656   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7657   match(Set dst (MulVF src1 src2));
7658   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7659   ins_encode %{
7660     int vector_len = 0;
7661     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7662   %}
7663   ins_pipe( pipe_slow );
7664 %}
7665 
7666 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7667   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7668   match(Set dst (MulVF src (LoadVector mem)));
7669   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7670   ins_encode %{
7671     int vector_len = 0;
7672     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7673   %}
7674   ins_pipe( pipe_slow );
7675 %}
7676 
7677 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7678   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7679   match(Set dst (MulVF src1 src2));
7680   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7681   ins_encode %{
7682     int vector_len = 1;
7683     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7684   %}
7685   ins_pipe( pipe_slow );
7686 %}
7687 
7688 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7689   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7690   match(Set dst (MulVF src (LoadVector mem)));
7691   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7692   ins_encode %{
7693     int vector_len = 1;
7694     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7695   %}
7696   ins_pipe( pipe_slow );
7697 %}
7698 
7699 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7700   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7701   match(Set dst (MulVF src1 src2));
7702   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7703   ins_encode %{
7704     int vector_len = 2;
7705     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7706   %}
7707   ins_pipe( pipe_slow );
7708 %}
7709 
7710 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7711   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7712   match(Set dst (MulVF src (LoadVector mem)));
7713   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7714   ins_encode %{
7715     int vector_len = 2;
7716     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7717   %}
7718   ins_pipe( pipe_slow );
7719 %}
7720 
7721 // Doubles vector mul
7722 instruct vmul2D(vecX dst, vecX src) %{
7723   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7724   match(Set dst (MulVD dst src));
7725   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7726   ins_encode %{
7727     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7728   %}
7729   ins_pipe( pipe_slow );
7730 %}
7731 
7732 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7733   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7734   match(Set dst (MulVD src1 src2));
7735   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7736   ins_encode %{
7737     int vector_len = 0;
7738     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7739   %}
7740   ins_pipe( pipe_slow );
7741 %}
7742 
7743 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7744   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7745   match(Set dst (MulVD src (LoadVector mem)));
7746   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7747   ins_encode %{
7748     int vector_len = 0;
7749     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7750   %}
7751   ins_pipe( pipe_slow );
7752 %}
7753 
7754 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7755   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7756   match(Set dst (MulVD src1 src2));
7757   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7758   ins_encode %{
7759     int vector_len = 1;
7760     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7761   %}
7762   ins_pipe( pipe_slow );
7763 %}
7764 
7765 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7766   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7767   match(Set dst (MulVD src (LoadVector mem)));
7768   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7769   ins_encode %{
7770     int vector_len = 1;
7771     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7772   %}
7773   ins_pipe( pipe_slow );
7774 %}
7775 
7776 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7777   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7778   match(Set dst (MulVD src1 src2));
7779   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
7780   ins_encode %{
7781     int vector_len = 2;
7782     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7783   %}
7784   ins_pipe( pipe_slow );
7785 %}
7786 
7787 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7788   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7789   match(Set dst (MulVD src (LoadVector mem)));
7790   format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
7791   ins_encode %{
7792     int vector_len = 2;
7793     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7794   %}
7795   ins_pipe( pipe_slow );
7796 %}
7797 
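     // Vector conditional move (CMoveVF/CMoveVD): the packed compare
     // (cmpps/cmppd) writes an all-ones or all-zeros mask into each lane of
     // $dst, and the variable blend (blendvps/blendvpd) then picks each lane
     // from $src1 or $src2 according to that mask, so no branch is emitted.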
7798 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7799   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7800   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7801   effect(TEMP dst, USE src1, USE src2);
7802   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7803             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7804          %}
7805   ins_encode %{
7806     int vector_len = 1;
7807     int cond = (Assembler::Condition)($copnd$$cmpcode);
7808     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7809     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7810   %}
7811   ins_pipe( pipe_slow );
7812 %}
7813 
7814 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7815   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7816   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7817   effect(TEMP dst, USE src1, USE src2);
7818   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7819             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7820          %}
7821   ins_encode %{
7822     int vector_len = 1;
7823     int cond = (Assembler::Condition)($copnd$$cmpcode);
7824     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7825     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7826   %}
7827   ins_pipe( pipe_slow );
7828 %}
7829 
7830 // --------------------------------- DIV --------------------------------------
7831 
7832 // Floats vector div
7833 instruct vdiv2F(vecD dst, vecD src) %{
7834   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7835   match(Set dst (DivVF dst src));
7836   format %{ "divps   $dst,$src\t! div packed2F" %}
7837   ins_encode %{
7838     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7839   %}
7840   ins_pipe( pipe_slow );
7841 %}
7842 
7843 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7844   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7845   match(Set dst (DivVF src1 src2));
7846   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
7847   ins_encode %{
7848     int vector_len = 0;
7849     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7850   %}
7851   ins_pipe( pipe_slow );
7852 %}
7853 
7854 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
7855   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7856   match(Set dst (DivVF src (LoadVector mem)));
7857   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
7858   ins_encode %{
7859     int vector_len = 0;
7860     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7861   %}
7862   ins_pipe( pipe_slow );
7863 %}
7864 
7865 instruct vdiv4F(vecX dst, vecX src) %{
7866   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7867   match(Set dst (DivVF dst src));
7868   format %{ "divps   $dst,$src\t! div packed4F" %}
7869   ins_encode %{
7870     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7871   %}
7872   ins_pipe( pipe_slow );
7873 %}
7874 
7875 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
7876   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7877   match(Set dst (DivVF src1 src2));
7878   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
7879   ins_encode %{
7880     int vector_len = 0;
7881     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7882   %}
7883   ins_pipe( pipe_slow );
7884 %}
7885 
7886 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
7887   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7888   match(Set dst (DivVF src (LoadVector mem)));
7889   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
7890   ins_encode %{
7891     int vector_len = 0;
7892     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7893   %}
7894   ins_pipe( pipe_slow );
7895 %}
7896 
7897 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
7898   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7899   match(Set dst (DivVF src1 src2));
7900   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
7901   ins_encode %{
7902     int vector_len = 1;
7903     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7904   %}
7905   ins_pipe( pipe_slow );
7906 %}
7907 
7908 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
7909   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7910   match(Set dst (DivVF src (LoadVector mem)));
7911   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
7912   ins_encode %{
7913     int vector_len = 1;
7914     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7915   %}
7916   ins_pipe( pipe_slow );
7917 %}
7918 
7919 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7920   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7921   match(Set dst (DivVF src1 src2));
7922   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
7923   ins_encode %{
7924     int vector_len = 2;
7925     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7926   %}
7927   ins_pipe( pipe_slow );
7928 %}
7929 
7930 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
7931   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7932   match(Set dst (DivVF src (LoadVector mem)));
7933   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
7934   ins_encode %{
7935     int vector_len = 2;
7936     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7937   %}
7938   ins_pipe( pipe_slow );
7939 %}
7940 
7941 // Doubles vector div
7942 instruct vdiv2D(vecX dst, vecX src) %{
7943   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7944   match(Set dst (DivVD dst src));
7945   format %{ "divpd   $dst,$src\t! div packed2D" %}
7946   ins_encode %{
7947     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
7948   %}
7949   ins_pipe( pipe_slow );
7950 %}
7951 
7952 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
7953   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7954   match(Set dst (DivVD src1 src2));
7955   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
7956   ins_encode %{
7957     int vector_len = 0;
7958     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7959   %}
7960   ins_pipe( pipe_slow );
7961 %}
7962 
7963 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
7964   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7965   match(Set dst (DivVD src (LoadVector mem)));
7966   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
7967   ins_encode %{
7968     int vector_len = 0;
7969     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7970   %}
7971   ins_pipe( pipe_slow );
7972 %}
7973 
7974 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
7975   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7976   match(Set dst (DivVD src1 src2));
7977   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
7978   ins_encode %{
7979     int vector_len = 1;
7980     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7981   %}
7982   ins_pipe( pipe_slow );
7983 %}
7984 
7985 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
7986   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7987   match(Set dst (DivVD src (LoadVector mem)));
7988   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
7989   ins_encode %{
7990     int vector_len = 1;
7991     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7992   %}
7993   ins_pipe( pipe_slow );
7994 %}
7995 
7996 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7997   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7998   match(Set dst (DivVD src1 src2));
7999   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8000   ins_encode %{
8001     int vector_len = 2;
8002     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8003   %}
8004   ins_pipe( pipe_slow );
8005 %}
8006 
8007 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8008   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8009   match(Set dst (DivVD src (LoadVector mem)));
8010   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8011   ins_encode %{
8012     int vector_len = 2;
8013     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8014   %}
8015   ins_pipe( pipe_slow );
8016 %}
8017 
8018 // ------------------------------ Shift ---------------------------------------
8019 
8020 // Left and right shift count vectors are the same on x86
8021 // (only the lowest bits of the xmm register are used for the count).
8022 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8023   match(Set dst (LShiftCntV cnt));
8024   match(Set dst (RShiftCntV cnt));
8025   format %{ "movd    $dst,$cnt\t! load shift count" %}
8026   ins_encode %{
8027     __ movdl($dst$$XMMRegister, $cnt$$Register);
8028   %}
8029   ins_pipe( pipe_slow );
8030 %}
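     // A minimal Java sketch (names hypothetical, illustrative only) of loops
     // whose variable shift count would be materialized once by the vshiftcnt
     // rule above if superword vectorizes them; both shift directions can
     // reuse the same xmm count, since the packed shifts read only its low bits:
     //
     //   static void shifts(int[] a, int[] b, int s) {
     //     for (int i = 0; i < a.length; i++) a[i] = a[i] << s;   // LShiftVI + LShiftCntV
     //     for (int i = 0; i < b.length; i++) b[i] = b[i] >> s;   // RShiftVI + RShiftCntV
     //   }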
8031 
8032 // --------------------------------- Sqrt --------------------------------------
8033 
8034 // Floating point vector sqrt
8035 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8036   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8037   match(Set dst (SqrtVD src));
8038   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8039   ins_encode %{
8040     int vector_len = 0;
8041     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8042   %}
8043   ins_pipe( pipe_slow );
8044 %}
8045 
8046 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8047   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8048   match(Set dst (SqrtVD (LoadVector mem)));
8049   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8050   ins_encode %{
8051     int vector_len = 0;
8052     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8053   %}
8054   ins_pipe( pipe_slow );
8055 %}
8056 
8057 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8058   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8059   match(Set dst (SqrtVD src));
8060   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8061   ins_encode %{
8062     int vector_len = 1;
8063     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8064   %}
8065   ins_pipe( pipe_slow );
8066 %}
8067 
8068 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8069   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8070   match(Set dst (SqrtVD (LoadVector mem)));
8071   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8072   ins_encode %{
8073     int vector_len = 1;
8074     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8075   %}
8076   ins_pipe( pipe_slow );
8077 %}
8078 
8079 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8080   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8081   match(Set dst (SqrtVD src));
8082   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8083   ins_encode %{
8084     int vector_len = 2;
8085     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8086   %}
8087   ins_pipe( pipe_slow );
8088 %}
8089 
8090 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8091   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8092   match(Set dst (SqrtVD (LoadVector mem)));
8093   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8094   ins_encode %{
8095     int vector_len = 2;
8096     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8097   %}
8098   ins_pipe( pipe_slow );
8099 %}
8100 
8101 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8102   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8103   match(Set dst (SqrtVF src));
8104   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8105   ins_encode %{
8106     int vector_len = 0;
8107     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8108   %}
8109   ins_pipe( pipe_slow );
8110 %}
8111 
8112 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8113   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8114   match(Set dst (SqrtVF (LoadVector mem)));
8115   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8116   ins_encode %{
8117     int vector_len = 0;
8118     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8119   %}
8120   ins_pipe( pipe_slow );
8121 %}
8122 
8123 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8124   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8125   match(Set dst (SqrtVF src));
8126   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8127   ins_encode %{
8128     int vector_len = 0;
8129     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8130   %}
8131   ins_pipe( pipe_slow );
8132 %}
8133 
8134 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8135   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8136   match(Set dst (SqrtVF (LoadVector mem)));
8137   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8138   ins_encode %{
8139     int vector_len = 0;
8140     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8141   %}
8142   ins_pipe( pipe_slow );
8143 %}
8144 
8145 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8146   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8147   match(Set dst (SqrtVF src));
8148   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8149   ins_encode %{
8150     int vector_len = 1;
8151     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8152   %}
8153   ins_pipe( pipe_slow );
8154 %}
8155 
8156 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8157   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8158   match(Set dst (SqrtVF (LoadVector mem)));
8159   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8160   ins_encode %{
8161     int vector_len = 1;
8162     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8163   %}
8164   ins_pipe( pipe_slow );
8165 %}
8166 
8167 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8168   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8169   match(Set dst (SqrtVF src));
8170   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8171   ins_encode %{
8172     int vector_len = 2;
8173     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8174   %}
8175   ins_pipe( pipe_slow );
8176 %}
8177 
8178 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8179   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8180   match(Set dst (SqrtVF (LoadVector mem)));
8181   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8182   ins_encode %{
8183     int vector_len = 2;
8184     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8185   %}
8186   ins_pipe( pipe_slow );
8187 %}
8188 
8189 // ------------------------------ LeftShift -----------------------------------
8190 
8191 // Shorts/Chars vector left shift
8192 instruct vsll2S(vecS dst, vecS shift) %{
8193   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8194   match(Set dst (LShiftVS dst shift));
8195   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8196   ins_encode %{
8197     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8198   %}
8199   ins_pipe( pipe_slow );
8200 %}
8201 
8202 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8203   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8204   match(Set dst (LShiftVS dst shift));
8205   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8206   ins_encode %{
8207     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8208   %}
8209   ins_pipe( pipe_slow );
8210 %}
8211 
8212 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
8213   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8214   match(Set dst (LShiftVS src shift));
8215   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8216   ins_encode %{
8217     int vector_len = 0;
8218     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8219   %}
8220   ins_pipe( pipe_slow );
8221 %}
8222 
8223 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8224   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8225   match(Set dst (LShiftVS src shift));
8226   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8227   ins_encode %{
8228     int vector_len = 0;
8229     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8230   %}
8231   ins_pipe( pipe_slow );
8232 %}
8233 
8234 instruct vsll4S(vecD dst, vecS shift) %{
8235   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8236   match(Set dst (LShiftVS dst shift));
8237   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8238   ins_encode %{
8239     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8240   %}
8241   ins_pipe( pipe_slow );
8242 %}
8243 
8244 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8245   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8246   match(Set dst (LShiftVS dst shift));
8247   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8248   ins_encode %{
8249     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8250   %}
8251   ins_pipe( pipe_slow );
8252 %}
8253 
8254 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
8255   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8256   match(Set dst (LShiftVS src shift));
8257   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8258   ins_encode %{
8259     int vector_len = 0;
8260     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8261   %}
8262   ins_pipe( pipe_slow );
8263 %}
8264 
8265 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8266   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8267   match(Set dst (LShiftVS src shift));
8268   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8269   ins_encode %{
8270     int vector_len = 0;
8271     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8272   %}
8273   ins_pipe( pipe_slow );
8274 %}
8275 
8276 instruct vsll8S(vecX dst, vecS shift) %{
8277   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8278   match(Set dst (LShiftVS dst shift));
8279   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8280   ins_encode %{
8281     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8282   %}
8283   ins_pipe( pipe_slow );
8284 %}
8285 
8286 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8287   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8288   match(Set dst (LShiftVS dst shift));
8289   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8290   ins_encode %{
8291     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8292   %}
8293   ins_pipe( pipe_slow );
8294 %}
8295 
8296 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
8297   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8298   match(Set dst (LShiftVS src shift));
8299   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8300   ins_encode %{
8301     int vector_len = 0;
8302     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8303   %}
8304   ins_pipe( pipe_slow );
8305 %}
8306 
8307 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8308   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8309   match(Set dst (LShiftVS src shift));
8310   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8311   ins_encode %{
8312     int vector_len = 0;
8313     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8314   %}
8315   ins_pipe( pipe_slow );
8316 %}
8317 
8318 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
8319   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8320   match(Set dst (LShiftVS src shift));
8321   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8322   ins_encode %{
8323     int vector_len = 1;
8324     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8325   %}
8326   ins_pipe( pipe_slow );
8327 %}
8328 
8329 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8330   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8331   match(Set dst (LShiftVS src shift));
8332   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8333   ins_encode %{
8334     int vector_len = 1;
8335     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8336   %}
8337   ins_pipe( pipe_slow );
8338 %}
8339 
8340 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8341   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8342   match(Set dst (LShiftVS src shift));
8343   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8344   ins_encode %{
8345     int vector_len = 2;
8346     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8347   %}
8348   ins_pipe( pipe_slow );
8349 %}
8350 
8351 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8352   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8353   match(Set dst (LShiftVS src shift));
8354   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8355   ins_encode %{
8356     int vector_len = 2;
8357     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8358   %}
8359   ins_pipe( pipe_slow );
8360 %}
8361 
8362 // Integers vector left shift
8363 instruct vsll2I(vecD dst, vecS shift) %{
8364   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8365   match(Set dst (LShiftVI dst shift));
8366   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8367   ins_encode %{
8368     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8369   %}
8370   ins_pipe( pipe_slow );
8371 %}
8372 
8373 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8374   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8375   match(Set dst (LShiftVI dst shift));
8376   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8377   ins_encode %{
8378     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8379   %}
8380   ins_pipe( pipe_slow );
8381 %}
8382 
8383 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8384   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8385   match(Set dst (LShiftVI src shift));
8386   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8387   ins_encode %{
8388     int vector_len = 0;
8389     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8390   %}
8391   ins_pipe( pipe_slow );
8392 %}
8393 
8394 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8395   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8396   match(Set dst (LShiftVI src shift));
8397   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8398   ins_encode %{
8399     int vector_len = 0;
8400     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8401   %}
8402   ins_pipe( pipe_slow );
8403 %}
8404 
8405 instruct vsll4I(vecX dst, vecS shift) %{
8406   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8407   match(Set dst (LShiftVI dst shift));
8408   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8409   ins_encode %{
8410     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8411   %}
8412   ins_pipe( pipe_slow );
8413 %}
8414 
8415 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8416   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8417   match(Set dst (LShiftVI dst shift));
8418   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8419   ins_encode %{
8420     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8421   %}
8422   ins_pipe( pipe_slow );
8423 %}
8424 
8425 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8426   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8427   match(Set dst (LShiftVI src shift));
8428   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8429   ins_encode %{
8430     int vector_len = 0;
8431     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8432   %}
8433   ins_pipe( pipe_slow );
8434 %}
8435 
8436 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8437   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8438   match(Set dst (LShiftVI src shift));
8439   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8440   ins_encode %{
8441     int vector_len = 0;
8442     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8443   %}
8444   ins_pipe( pipe_slow );
8445 %}
8446 
8447 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
8448   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8449   match(Set dst (LShiftVI src shift));
8450   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8451   ins_encode %{
8452     int vector_len = 1;
8453     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8454   %}
8455   ins_pipe( pipe_slow );
8456 %}
8457 
8458 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8459   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8460   match(Set dst (LShiftVI src shift));
8461   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8462   ins_encode %{
8463     int vector_len = 1;
8464     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8465   %}
8466   ins_pipe( pipe_slow );
8467 %}
8468 
8469 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
8470   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8471   match(Set dst (LShiftVI src shift));
8472   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8473   ins_encode %{
8474     int vector_len = 2;
8475     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8476   %}
8477   ins_pipe( pipe_slow );
8478 %}
8479 
8480 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8481   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8482   match(Set dst (LShiftVI src shift));
8483   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8484   ins_encode %{
8485     int vector_len = 2;
8486     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8487   %}
8488   ins_pipe( pipe_slow );
8489 %}
8490 
8491 // Longs vector left shift
8492 instruct vsll2L(vecX dst, vecS shift) %{
8493   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8494   match(Set dst (LShiftVL dst shift));
8495   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8496   ins_encode %{
8497     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
8498   %}
8499   ins_pipe( pipe_slow );
8500 %}
8501 
8502 instruct vsll2L_imm(vecX dst, immI8 shift) %{
8503   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8504   match(Set dst (LShiftVL dst shift));
8505   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8506   ins_encode %{
8507     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
8508   %}
8509   ins_pipe( pipe_slow );
8510 %}
8511 
8512 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
8513   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8514   match(Set dst (LShiftVL src shift));
8515   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8516   ins_encode %{
8517     int vector_len = 0;
8518     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8519   %}
8520   ins_pipe( pipe_slow );
8521 %}
8522 
8523 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8524   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8525   match(Set dst (LShiftVL src shift));
8526   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8527   ins_encode %{
8528     int vector_len = 0;
8529     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8530   %}
8531   ins_pipe( pipe_slow );
8532 %}
8533 
8534 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
8535   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8536   match(Set dst (LShiftVL src shift));
8537   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8538   ins_encode %{
8539     int vector_len = 1;
8540     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8541   %}
8542   ins_pipe( pipe_slow );
8543 %}
8544 
8545 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8546   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8547   match(Set dst (LShiftVL src shift));
8548   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8549   ins_encode %{
8550     int vector_len = 1;
8551     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8552   %}
8553   ins_pipe( pipe_slow );
8554 %}
8555 
8556 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8557   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8558   match(Set dst (LShiftVL src shift));
8559   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8560   ins_encode %{
8561     int vector_len = 2;
8562     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8563   %}
8564   ins_pipe( pipe_slow );
8565 %}
8566 
8567 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8568   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8569   match(Set dst (LShiftVL src shift));
8570   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8571   ins_encode %{
8572     int vector_len = 2;
8573     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8574   %}
8575   ins_pipe( pipe_slow );
8576 %}
8577 
8578 // ----------------------- LogicalRightShift -----------------------------------
8579 
8580 // A logical right shift of a shorts vector produces an incorrect Java result
8581 // for negative data, because Java code converts the short value to an int with
8582 // sign extension before shifting. But char vectors are fine, since chars are
8583 // unsigned values.
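     // Worked example: for short s = -1, Java evaluates (short)(s >>> 3) as
     // ((int)0xFFFFFFFF) >>> 3 = 0x1FFFFFFF, which narrows back to (short)-1,
     // while a packed 16-bit psrlw would compute 0xFFFF >>> 3 = 0x1FFF = 8191.
     // For char c = '\uFFFF' both routes yield 0x1FFF, so char data is safe:
     //
     //   short s = -1;
     //   short r = (short)(s >>> 3);   // -1 in Java; psrlw would give 8191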
8584 
8585 instruct vsrl2S(vecS dst, vecS shift) %{
8586   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8587   match(Set dst (URShiftVS dst shift));
8588   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8589   ins_encode %{
8590     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8591   %}
8592   ins_pipe( pipe_slow );
8593 %}
8594 
8595 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
8596   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8597   match(Set dst (URShiftVS dst shift));
8598   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8599   ins_encode %{
8600     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8601   %}
8602   ins_pipe( pipe_slow );
8603 %}
8604 
8605 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
8606   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8607   match(Set dst (URShiftVS src shift));
8608   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8609   ins_encode %{
8610     int vector_len = 0;
8611     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8612   %}
8613   ins_pipe( pipe_slow );
8614 %}
8615 
8616 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8617   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8618   match(Set dst (URShiftVS src shift));
8619   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8620   ins_encode %{
8621     int vector_len = 0;
8622     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8623   %}
8624   ins_pipe( pipe_slow );
8625 %}
8626 
8627 instruct vsrl4S(vecD dst, vecS shift) %{
8628   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8629   match(Set dst (URShiftVS dst shift));
8630   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8631   ins_encode %{
8632     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8633   %}
8634   ins_pipe( pipe_slow );
8635 %}
8636 
8637 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
8638   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8639   match(Set dst (URShiftVS dst shift));
8640   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8641   ins_encode %{
8642     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8643   %}
8644   ins_pipe( pipe_slow );
8645 %}
8646 
8647 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
8648   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8649   match(Set dst (URShiftVS src shift));
8650   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8651   ins_encode %{
8652     int vector_len = 0;
8653     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8654   %}
8655   ins_pipe( pipe_slow );
8656 %}
8657 
8658 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8659   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8660   match(Set dst (URShiftVS src shift));
8661   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8662   ins_encode %{
8663     int vector_len = 0;
8664     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8665   %}
8666   ins_pipe( pipe_slow );
8667 %}
8668 
8669 instruct vsrl8S(vecX dst, vecS shift) %{
8670   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8671   match(Set dst (URShiftVS dst shift));
8672   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8673   ins_encode %{
8674     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8675   %}
8676   ins_pipe( pipe_slow );
8677 %}
8678 
8679 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
8680   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8681   match(Set dst (URShiftVS dst shift));
8682   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8683   ins_encode %{
8684     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8685   %}
8686   ins_pipe( pipe_slow );
8687 %}
8688 
8689 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
8690   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8691   match(Set dst (URShiftVS src shift));
8692   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8693   ins_encode %{
8694     int vector_len = 0;
8695     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8696   %}
8697   ins_pipe( pipe_slow );
8698 %}
8699 
8700 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8701   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8702   match(Set dst (URShiftVS src shift));
8703   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8704   ins_encode %{
8705     int vector_len = 0;
8706     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8707   %}
8708   ins_pipe( pipe_slow );
8709 %}
8710 
8711 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
8712   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8713   match(Set dst (URShiftVS src shift));
8714   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8715   ins_encode %{
8716     int vector_len = 1;
8717     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8718   %}
8719   ins_pipe( pipe_slow );
8720 %}
8721 
8722 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8723   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8724   match(Set dst (URShiftVS src shift));
8725   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8726   ins_encode %{
8727     int vector_len = 1;
8728     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8729   %}
8730   ins_pipe( pipe_slow );
8731 %}
8732 
8733 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
8734   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8735   match(Set dst (URShiftVS src shift));
8736   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8737   ins_encode %{
8738     int vector_len = 2;
8739     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8740   %}
8741   ins_pipe( pipe_slow );
8742 %}
8743 
8744 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8745   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8746   match(Set dst (URShiftVS src shift));
8747   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8748   ins_encode %{
8749     int vector_len = 2;
8750     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8751   %}
8752   ins_pipe( pipe_slow );
8753 %}
8754 
8755 // Integers vector logical right shift
8756 instruct vsrl2I(vecD dst, vecS shift) %{
8757   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8758   match(Set dst (URShiftVI dst shift));
8759   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8760   ins_encode %{
8761     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8762   %}
8763   ins_pipe( pipe_slow );
8764 %}
8765 
8766 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
8767   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8768   match(Set dst (URShiftVI dst shift));
8769   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8770   ins_encode %{
8771     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8772   %}
8773   ins_pipe( pipe_slow );
8774 %}
8775 
8776 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
8777   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8778   match(Set dst (URShiftVI src shift));
8779   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8780   ins_encode %{
8781     int vector_len = 0;
8782     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8783   %}
8784   ins_pipe( pipe_slow );
8785 %}
8786 
8787 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8788   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8789   match(Set dst (URShiftVI src shift));
8790   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8791   ins_encode %{
8792     int vector_len = 0;
8793     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8794   %}
8795   ins_pipe( pipe_slow );
8796 %}
8797 
8798 instruct vsrl4I(vecX dst, vecS shift) %{
8799   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8800   match(Set dst (URShiftVI dst shift));
8801   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8802   ins_encode %{
8803     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8804   %}
8805   ins_pipe( pipe_slow );
8806 %}
8807 
8808 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
8809   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8810   match(Set dst (URShiftVI dst shift));
8811   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8812   ins_encode %{
8813     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8814   %}
8815   ins_pipe( pipe_slow );
8816 %}
8817 
8818 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
8819   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8820   match(Set dst (URShiftVI src shift));
8821   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8822   ins_encode %{
8823     int vector_len = 0;
8824     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8825   %}
8826   ins_pipe( pipe_slow );
8827 %}
8828 
8829 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8830   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8831   match(Set dst (URShiftVI src shift));
8832   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8833   ins_encode %{
8834     int vector_len = 0;
8835     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8836   %}
8837   ins_pipe( pipe_slow );
8838 %}
8839 
8840 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
8841   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8842   match(Set dst (URShiftVI src shift));
8843   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8844   ins_encode %{
8845     int vector_len = 1;
8846     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8847   %}
8848   ins_pipe( pipe_slow );
8849 %}
8850 
8851 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8852   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8853   match(Set dst (URShiftVI src shift));
8854   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8855   ins_encode %{
8856     int vector_len = 1;
8857     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8858   %}
8859   ins_pipe( pipe_slow );
8860 %}
8861 
8862 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
8863   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8864   match(Set dst (URShiftVI src shift));
8865   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8866   ins_encode %{
8867     int vector_len = 2;
8868     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8869   %}
8870   ins_pipe( pipe_slow );
8871 %}
8872 
8873 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8874   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8875   match(Set dst (URShiftVI src shift));
8876   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8877   ins_encode %{
8878     int vector_len = 2;
8879     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8880   %}
8881   ins_pipe( pipe_slow );
8882 %}
8883 
8884 // Longs vector logical right shift
8885 instruct vsrl2L(vecX dst, vecS shift) %{
8886   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8887   match(Set dst (URShiftVL dst shift));
8888   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
8889   ins_encode %{
8890     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8891   %}
8892   ins_pipe( pipe_slow );
8893 %}
8894 
8895 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
8896   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8897   match(Set dst (URShiftVL dst shift));
8898   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
8899   ins_encode %{
8900     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
8901   %}
8902   ins_pipe( pipe_slow );
8903 %}
8904 
8905 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
8906   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8907   match(Set dst (URShiftVL src shift));
8908   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
8909   ins_encode %{
8910     int vector_len = 0;
8911     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8912   %}
8913   ins_pipe( pipe_slow );
8914 %}
8915 
8916 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8917   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8918   match(Set dst (URShiftVL src shift));
8919   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
8920   ins_encode %{
8921     int vector_len = 0;
8922     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8923   %}
8924   ins_pipe( pipe_slow );
8925 %}
8926 
8927 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
8928   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8929   match(Set dst (URShiftVL src shift));
8930   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
8931   ins_encode %{
8932     int vector_len = 1;
8933     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8934   %}
8935   ins_pipe( pipe_slow );
8936 %}
8937 
8938 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8939   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8940   match(Set dst (URShiftVL src shift));
8941   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
8942   ins_encode %{
8943     int vector_len = 1;
8944     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8945   %}
8946   ins_pipe( pipe_slow );
8947 %}
8948 
8949 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
8950   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8951   match(Set dst (URShiftVL src shift));
8952   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8953   ins_encode %{
8954     int vector_len = 2;
8955     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8956   %}
8957   ins_pipe( pipe_slow );
8958 %}
8959 
8960 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8961   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8962   match(Set dst (URShiftVL src shift));
8963   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8964   ins_encode %{
8965     int vector_len = 2;
8966     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8967   %}
8968   ins_pipe( pipe_slow );
8969 %}
8970 
8971 // ------------------- ArithmeticRightShift -----------------------------------
8972 
8973 // Shorts/Chars vector arithmetic right shift
8974 instruct vsra2S(vecS dst, vecS shift) %{
8975   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8976   match(Set dst (RShiftVS dst shift));
8977   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8978   ins_encode %{
8979     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8980   %}
8981   ins_pipe( pipe_slow );
8982 %}
8983 
8984 instruct vsra2S_imm(vecS dst, immI8 shift) %{
8985   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8986   match(Set dst (RShiftVS dst shift));
8987   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8988   ins_encode %{
8989     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8990   %}
8991   ins_pipe( pipe_slow );
8992 %}
8993 
8994 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
8995   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8996   match(Set dst (RShiftVS src shift));
8997   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
8998   ins_encode %{
8999     int vector_len = 0;
9000     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9001   %}
9002   ins_pipe( pipe_slow );
9003 %}
9004 
9005 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
9006   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9007   match(Set dst (RShiftVS src shift));
9008   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9009   ins_encode %{
9010     int vector_len = 0;
9011     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9012   %}
9013   ins_pipe( pipe_slow );
9014 %}
9015 
9016 instruct vsra4S(vecD dst, vecS shift) %{
9017   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9018   match(Set dst (RShiftVS dst shift));
9019   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9020   ins_encode %{
9021     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9022   %}
9023   ins_pipe( pipe_slow );
9024 %}
9025 
9026 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9027   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9028   match(Set dst (RShiftVS dst shift));
9029   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9030   ins_encode %{
9031     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9032   %}
9033   ins_pipe( pipe_slow );
9034 %}
9035 
9036 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
9037   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9038   match(Set dst (RShiftVS src shift));
9039   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9040   ins_encode %{
9041     int vector_len = 0;
9042     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9043   %}
9044   ins_pipe( pipe_slow );
9045 %}
9046 
9047 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
9048   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9049   match(Set dst (RShiftVS src shift));
9050   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9051   ins_encode %{
9052     int vector_len = 0;
9053     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9054   %}
9055   ins_pipe( pipe_slow );
9056 %}
9057 
9058 instruct vsra8S(vecX dst, vecS shift) %{
9059   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9060   match(Set dst (RShiftVS dst shift));
9061   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9062   ins_encode %{
9063     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9064   %}
9065   ins_pipe( pipe_slow );
9066 %}
9067 
9068 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9069   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9070   match(Set dst (RShiftVS dst shift));
9071   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9072   ins_encode %{
9073     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9074   %}
9075   ins_pipe( pipe_slow );
9076 %}
9077 
9078 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
9079   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9080   match(Set dst (RShiftVS src shift));
9081   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9082   ins_encode %{
9083     int vector_len = 0;
9084     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9085   %}
9086   ins_pipe( pipe_slow );
9087 %}
9088 
9089 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
9090   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9091   match(Set dst (RShiftVS src shift));
9092   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9093   ins_encode %{
9094     int vector_len = 0;
9095     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9096   %}
9097   ins_pipe( pipe_slow );
9098 %}
9099 
9100 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
9101   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9102   match(Set dst (RShiftVS src shift));
9103   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9104   ins_encode %{
9105     int vector_len = 1;
9106     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9107   %}
9108   ins_pipe( pipe_slow );
9109 %}
9110 
9111 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
9112   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9113   match(Set dst (RShiftVS src shift));
9114   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9115   ins_encode %{
9116     int vector_len = 1;
9117     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9118   %}
9119   ins_pipe( pipe_slow );
9120 %}
9121 
9122 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
9123   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9124   match(Set dst (RShiftVS src shift));
9125   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9126   ins_encode %{
9127     int vector_len = 2;
9128     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9129   %}
9130   ins_pipe( pipe_slow );
9131 %}
9132 
9133 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9134   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9135   match(Set dst (RShiftVS src shift));
9136   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9137   ins_encode %{
9138     int vector_len = 2;
9139     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9140   %}
9141   ins_pipe( pipe_slow );
9142 %}
9143 
9144 // Integers vector arithmetic right shift
9145 instruct vsra2I(vecD dst, vecS shift) %{
9146   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9147   match(Set dst (RShiftVI dst shift));
9148   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9149   ins_encode %{
9150     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9151   %}
9152   ins_pipe( pipe_slow );
9153 %}
9154 
9155 instruct vsra2I_imm(vecD dst, immI8 shift) %{
9156   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9157   match(Set dst (RShiftVI dst shift));
9158   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9159   ins_encode %{
9160     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9161   %}
9162   ins_pipe( pipe_slow );
9163 %}
9164 
9165 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
9166   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9167   match(Set dst (RShiftVI src shift));
9168   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9169   ins_encode %{
9170     int vector_len = 0;
9171     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9172   %}
9173   ins_pipe( pipe_slow );
9174 %}
9175 
9176 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9177   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9178   match(Set dst (RShiftVI src shift));
9179   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9180   ins_encode %{
9181     int vector_len = 0;
9182     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9183   %}
9184   ins_pipe( pipe_slow );
9185 %}
9186 
9187 instruct vsra4I(vecX dst, vecS shift) %{
9188   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9189   match(Set dst (RShiftVI dst shift));
9190   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9191   ins_encode %{
9192     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9193   %}
9194   ins_pipe( pipe_slow );
9195 %}
9196 
9197 instruct vsra4I_imm(vecX dst, immI8 shift) %{
9198   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9199   match(Set dst (RShiftVI dst shift));
9200   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9201   ins_encode %{
9202     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9203   %}
9204   ins_pipe( pipe_slow );
9205 %}
9206 
9207 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
9208   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9209   match(Set dst (RShiftVI src shift));
9210   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9211   ins_encode %{
9212     int vector_len = 0;
9213     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9214   %}
9215   ins_pipe( pipe_slow );
9216 %}
9217 
9218 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9219   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9220   match(Set dst (RShiftVI src shift));
9221   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9222   ins_encode %{
9223     int vector_len = 0;
9224     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9225   %}
9226   ins_pipe( pipe_slow );
9227 %}
9228 
9229 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
9230   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9231   match(Set dst (RShiftVI src shift));
9232   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9233   ins_encode %{
9234     int vector_len = 1;
9235     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9236   %}
9237   ins_pipe( pipe_slow );
9238 %}
9239 
9240 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9241   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9242   match(Set dst (RShiftVI src shift));
9243   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9244   ins_encode %{
9245     int vector_len = 1;
9246     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9247   %}
9248   ins_pipe( pipe_slow );
9249 %}
9250 
9251 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
9252   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9253   match(Set dst (RShiftVI src shift));
9254   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9255   ins_encode %{
9256     int vector_len = 2;
9257     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9258   %}
9259   ins_pipe( pipe_slow );
9260 %}
9261 
9262 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9263   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9264   match(Set dst (RShiftVI src shift));
9265   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9266   ins_encode %{
9267     int vector_len = 2;
9268     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9269   %}
9270   ins_pipe( pipe_slow );
9271 %}
9272 
9273 // There are no vector arithmetic right shift instructions for longs.
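// Neither SSE2 nor AVX2 provides a packed 64-bit arithmetic right shift;
// psraw/psrad have no quadword counterpart until AVX-512 adds the
// EVEX-encoded vpsraq.  As a reference-only sketch (names are illustrative,
// not part of the matcher), a 64-bit arithmetic shift can be emulated with a
// logical shift plus a sign-bias fix, assuming 0 <= s < 64:
//
//   int64_t sra64(int64_t x, int s) {
//     uint64_t m = UINT64_C(1) << 63;                         // sign-bit mask
//     return (int64_t)((((uint64_t)x ^ m) >> s) - (m >> s));  // logical shift + bias fix
//   }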
9274 
9275 
9276 // --------------------------------- AND --------------------------------------
9277 
9278 instruct vand4B(vecS dst, vecS src) %{
9279   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9280   match(Set dst (AndV dst src));
9281   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
9282   ins_encode %{
9283     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9284   %}
9285   ins_pipe( pipe_slow );
9286 %}
9287 
9288 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
9289   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9290   match(Set dst (AndV src1 src2));
9291   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
9292   ins_encode %{
9293     int vector_len = 0;
9294     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9295   %}
9296   ins_pipe( pipe_slow );
9297 %}
9298 
9299 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
9300   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9301   match(Set dst (AndV src (LoadVector mem)));
9302   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
9303   ins_encode %{
9304     int vector_len = 0;
9305     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9306   %}
9307   ins_pipe( pipe_slow );
9308 %}
9309 
9310 instruct vand8B(vecD dst, vecD src) %{
9311   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9312   match(Set dst (AndV dst src));
9313   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
9314   ins_encode %{
9315     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9316   %}
9317   ins_pipe( pipe_slow );
9318 %}
9319 
9320 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
9321   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9322   match(Set dst (AndV src1 src2));
9323   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
9324   ins_encode %{
9325     int vector_len = 0;
9326     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9327   %}
9328   ins_pipe( pipe_slow );
9329 %}
9330 
9331 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
9332   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9333   match(Set dst (AndV src (LoadVector mem)));
9334   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
9335   ins_encode %{
9336     int vector_len = 0;
9337     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9338   %}
9339   ins_pipe( pipe_slow );
9340 %}
9341 
9342 instruct vand16B(vecX dst, vecX src) %{
9343   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9344   match(Set dst (AndV dst src));
9345   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
9346   ins_encode %{
9347     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9348   %}
9349   ins_pipe( pipe_slow );
9350 %}
9351 
9352 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
9353   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9354   match(Set dst (AndV src1 src2));
9355   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
9356   ins_encode %{
9357     int vector_len = 0;
9358     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9359   %}
9360   ins_pipe( pipe_slow );
9361 %}
9362 
9363 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
9364   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9365   match(Set dst (AndV src (LoadVector mem)));
9366   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
9367   ins_encode %{
9368     int vector_len = 0;
9369     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9370   %}
9371   ins_pipe( pipe_slow );
9372 %}
9373 
9374 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
9375   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9376   match(Set dst (AndV src1 src2));
9377   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
9378   ins_encode %{
9379     int vector_len = 1;
9380     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9381   %}
9382   ins_pipe( pipe_slow );
9383 %}
9384 
9385 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
9386   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9387   match(Set dst (AndV src (LoadVector mem)));
9388   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
9389   ins_encode %{
9390     int vector_len = 1;
9391     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9392   %}
9393   ins_pipe( pipe_slow );
9394 %}
9395 
9396 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9397   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9398   match(Set dst (AndV src1 src2));
9399   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
9400   ins_encode %{
9401     int vector_len = 2;
9402     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9403   %}
9404   ins_pipe( pipe_slow );
9405 %}
9406 
9407 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
9408   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9409   match(Set dst (AndV src (LoadVector mem)));
9410   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
9411   ins_encode %{
9412     int vector_len = 2;
9413     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9414   %}
9415   ins_pipe( pipe_slow );
9416 %}
9417 
9418 // --------------------------------- OR ---------------------------------------
9419 
9420 instruct vor4B(vecS dst, vecS src) %{
9421   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9422   match(Set dst (OrV dst src));
9423   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
9424   ins_encode %{
9425     __ por($dst$$XMMRegister, $src$$XMMRegister);
9426   %}
9427   ins_pipe( pipe_slow );
9428 %}
9429 
9430 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
9431   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9432   match(Set dst (OrV src1 src2));
9433   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
9434   ins_encode %{
9435     int vector_len = 0;
9436     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9437   %}
9438   ins_pipe( pipe_slow );
9439 %}
9440 
9441 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
9442   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9443   match(Set dst (OrV src (LoadVector mem)));
9444   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
9445   ins_encode %{
9446     int vector_len = 0;
9447     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9448   %}
9449   ins_pipe( pipe_slow );
9450 %}
9451 
9452 instruct vor8B(vecD dst, vecD src) %{
9453   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9454   match(Set dst (OrV dst src));
9455   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
9456   ins_encode %{
9457     __ por($dst$$XMMRegister, $src$$XMMRegister);
9458   %}
9459   ins_pipe( pipe_slow );
9460 %}
9461 
9462 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9463   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9464   match(Set dst (OrV src1 src2));
9465   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9466   ins_encode %{
9467     int vector_len = 0;
9468     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9469   %}
9470   ins_pipe( pipe_slow );
9471 %}
9472 
9473 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9474   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9475   match(Set dst (OrV src (LoadVector mem)));
9476   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9477   ins_encode %{
9478     int vector_len = 0;
9479     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9480   %}
9481   ins_pipe( pipe_slow );
9482 %}
9483 
9484 instruct vor16B(vecX dst, vecX src) %{
9485   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9486   match(Set dst (OrV dst src));
9487   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9488   ins_encode %{
9489     __ por($dst$$XMMRegister, $src$$XMMRegister);
9490   %}
9491   ins_pipe( pipe_slow );
9492 %}
9493 
9494 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9495   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9496   match(Set dst (OrV src1 src2));
9497   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9498   ins_encode %{
9499     int vector_len = 0;
9500     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9501   %}
9502   ins_pipe( pipe_slow );
9503 %}
9504 
9505 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9506   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9507   match(Set dst (OrV src (LoadVector mem)));
9508   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9509   ins_encode %{
9510     int vector_len = 0;
9511     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9512   %}
9513   ins_pipe( pipe_slow );
9514 %}
9515 
9516 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9517   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9518   match(Set dst (OrV src1 src2));
9519   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9520   ins_encode %{
9521     int vector_len = 1;
9522     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9523   %}
9524   ins_pipe( pipe_slow );
9525 %}
9526 
9527 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9528   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9529   match(Set dst (OrV src (LoadVector mem)));
9530   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9531   ins_encode %{
9532     int vector_len = 1;
9533     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9534   %}
9535   ins_pipe( pipe_slow );
9536 %}
9537 
9538 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9539   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9540   match(Set dst (OrV src1 src2));
9541   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9542   ins_encode %{
9543     int vector_len = 2;
9544     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9545   %}
9546   ins_pipe( pipe_slow );
9547 %}
9548 
9549 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9550   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9551   match(Set dst (OrV src (LoadVector mem)));
9552   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9553   ins_encode %{
9554     int vector_len = 2;
9555     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9556   %}
9557   ins_pipe( pipe_slow );
9558 %}
9559 
9560 // --------------------------------- XOR --------------------------------------
9561 
9562 instruct vxor4B(vecS dst, vecS src) %{
9563   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9564   match(Set dst (XorV dst src));
9565   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9566   ins_encode %{
9567     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9568   %}
9569   ins_pipe( pipe_slow );
9570 %}
9571 
9572 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9573   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9574   match(Set dst (XorV src1 src2));
9575   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9576   ins_encode %{
9577     int vector_len = 0;
9578     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9579   %}
9580   ins_pipe( pipe_slow );
9581 %}
9582 
9583 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9584   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9585   match(Set dst (XorV src (LoadVector mem)));
9586   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9587   ins_encode %{
9588     int vector_len = 0;
9589     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9590   %}
9591   ins_pipe( pipe_slow );
9592 %}
9593 
9594 instruct vxor8B(vecD dst, vecD src) %{
9595   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9596   match(Set dst (XorV dst src));
9597   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9598   ins_encode %{
9599     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9600   %}
9601   ins_pipe( pipe_slow );
9602 %}
9603 
9604 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9605   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9606   match(Set dst (XorV src1 src2));
9607   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9608   ins_encode %{
9609     int vector_len = 0;
9610     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9611   %}
9612   ins_pipe( pipe_slow );
9613 %}
9614 
9615 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9616   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9617   match(Set dst (XorV src (LoadVector mem)));
9618   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9619   ins_encode %{
9620     int vector_len = 0;
9621     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9622   %}
9623   ins_pipe( pipe_slow );
9624 %}
9625 
9626 instruct vxor16B(vecX dst, vecX src) %{
9627   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9628   match(Set dst (XorV dst src));
9629   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9630   ins_encode %{
9631     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9632   %}
9633   ins_pipe( pipe_slow );
9634 %}
9635 
9636 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9637   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9638   match(Set dst (XorV src1 src2));
9639   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9640   ins_encode %{
9641     int vector_len = 0;
9642     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9643   %}
9644   ins_pipe( pipe_slow );
9645 %}
9646 
9647 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9648   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9649   match(Set dst (XorV src (LoadVector mem)));
9650   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9651   ins_encode %{
9652     int vector_len = 0;
9653     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9654   %}
9655   ins_pipe( pipe_slow );
9656 %}
9657 
9658 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9659   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9660   match(Set dst (XorV src1 src2));
9661   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9662   ins_encode %{
9663     int vector_len = 1;
9664     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9665   %}
9666   ins_pipe( pipe_slow );
9667 %}
9668 
9669 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9670   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9671   match(Set dst (XorV src (LoadVector mem)));
9672   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9673   ins_encode %{
9674     int vector_len = 1;
9675     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9676   %}
9677   ins_pipe( pipe_slow );
9678 %}
9679 
9680 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9681   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9682   match(Set dst (XorV src1 src2));
9683   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9684   ins_encode %{
9685     int vector_len = 2;
9686     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9687   %}
9688   ins_pipe( pipe_slow );
9689 %}
9690 
9691 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9692   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9693   match(Set dst (XorV src (LoadVector mem)));
9694   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9695   ins_encode %{
9696     int vector_len = 2;
9697     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9698   %}
9699   ins_pipe( pipe_slow );
9700 %}
9701 
9702 // --------------------------------- FMA --------------------------------------
9703 
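// The FmaVD/FmaVF rules below compute c = a * b + c lane-wise with a single
// rounding per lane (fused multiply-add).  Illustrative scalar equivalent,
// not part of the matcher ('lanes' is a placeholder for the vector length):
//
//   for (int i = 0; i < lanes; i++)
//     c[i] = fma(a[i], b[i], c[i]);   // C99 fma(): one rounding, not two
//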
9704 // a * b + c
9705 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9706   predicate(UseFMA && n->as_Vector()->length() == 2);
9707   match(Set c (FmaVD  c (Binary a b)));
9708   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9709   ins_cost(150);
9710   ins_encode %{
9711     int vector_len = 0;
9712     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9713   %}
9714   ins_pipe( pipe_slow );
9715 %}
9716 
9717 // a * b + c
9718 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9719   predicate(UseFMA && n->as_Vector()->length() == 2);
9720   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9721   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9722   ins_cost(150);
9723   ins_encode %{
9724     int vector_len = 0;
9725     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9726   %}
9727   ins_pipe( pipe_slow );
9728 %}
9729 
9730 
9731 // a * b + c
9732 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9733   predicate(UseFMA && n->as_Vector()->length() == 4);
9734   match(Set c (FmaVD  c (Binary a b)));
9735   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9736   ins_cost(150);
9737   ins_encode %{
9738     int vector_len = 1;
9739     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9740   %}
9741   ins_pipe( pipe_slow );
9742 %}
9743 
9744 // a * b + c
9745 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9746   predicate(UseFMA && n->as_Vector()->length() == 4);
9747   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9748   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9749   ins_cost(150);
9750   ins_encode %{
9751     int vector_len = 1;
9752     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9753   %}
9754   ins_pipe( pipe_slow );
9755 %}
9756 
9757 // a * b + c
9758 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9759   predicate(UseFMA && n->as_Vector()->length() == 8);
9760   match(Set c (FmaVD  c (Binary a b)));
9761   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9762   ins_cost(150);
9763   ins_encode %{
9764     int vector_len = 2;
9765     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9766   %}
9767   ins_pipe( pipe_slow );
9768 %}
9769 
9770 // a * b + c
9771 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9772   predicate(UseFMA && n->as_Vector()->length() == 8);
9773   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9774   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9775   ins_cost(150);
9776   ins_encode %{
9777     int vector_len = 2;
9778     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9779   %}
9780   ins_pipe( pipe_slow );
9781 %}
9782 
9783 // a * b + c
9784 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9785   predicate(UseFMA && n->as_Vector()->length() == 4);
9786   match(Set c (FmaVF  c (Binary a b)));
9787   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9788   ins_cost(150);
9789   ins_encode %{
9790     int vector_len = 0;
9791     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9792   %}
9793   ins_pipe( pipe_slow );
9794 %}
9795 
9796 // a * b + c
9797 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9798   predicate(UseFMA && n->as_Vector()->length() == 4);
9799   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9800   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9801   ins_cost(150);
9802   ins_encode %{
9803     int vector_len = 0;
9804     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9805   %}
9806   ins_pipe( pipe_slow );
9807 %}
9808 
9809 // a * b + c
9810 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9811   predicate(UseFMA && n->as_Vector()->length() == 8);
9812   match(Set c (FmaVF  c (Binary a b)));
9813   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9814   ins_cost(150);
9815   ins_encode %{
9816     int vector_len = 1;
9817     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9818   %}
9819   ins_pipe( pipe_slow );
9820 %}
9821 
9822 // a * b + c
9823 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9824   predicate(UseFMA && n->as_Vector()->length() == 8);
9825   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9826   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9827   ins_cost(150);
9828   ins_encode %{
9829     int vector_len = 1;
9830     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9831   %}
9832   ins_pipe( pipe_slow );
9833 %}
9834 
9835 // a * b + c
9836 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9837   predicate(UseFMA && n->as_Vector()->length() == 16);
9838   match(Set c (FmaVF  c (Binary a b)));
9839   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9840   ins_cost(150);
9841   ins_encode %{
9842     int vector_len = 2;
9843     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9844   %}
9845   ins_pipe( pipe_slow );
9846 %}
9847 
9848 // a * b + c
9849 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9850   predicate(UseFMA && n->as_Vector()->length() == 16);
9851   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9852   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9853   ins_cost(150);
9854   ins_encode %{
9855     int vector_len = 2;
9856     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9857   %}
9858   ins_pipe( pipe_slow );
9859 %}
9860 
9861 // --------------------------------- Vector Multiply Add --------------------------------------
9862 
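// pmaddwd/vpmaddwd multiply vertically paired signed 16-bit elements and add
// each adjacent pair of 32-bit products into one 32-bit result.  Illustrative
// scalar equivalent (names like int_lanes/src1_s16 are placeholders, not part
// of the matcher):
//
//   for (int i = 0; i < int_lanes; i++)
//     dst[i] = (int)src1_s16[2*i]     * (int)src2_s16[2*i]
//            + (int)src1_s16[2*i + 1] * (int)src2_s16[2*i + 1];
//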
9863 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9864   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9865   match(Set dst (MulAddVS2VI dst src1));
9866   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9867   ins_encode %{
9868     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9869   %}
9870   ins_pipe( pipe_slow );
9871 %}
9872 
9873 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9874   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9875   match(Set dst (MulAddVS2VI src1 src2));
9876   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9877   ins_encode %{
9878     int vector_len = 0;
9879     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9880   %}
9881   ins_pipe( pipe_slow );
9882 %}
9883 
9884 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
9885   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
9886   match(Set dst (MulAddVS2VI dst src1));
9887   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
9888   ins_encode %{
9889     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9890   %}
9891   ins_pipe( pipe_slow );
9892 %}
9893 
9894 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9895   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9896   match(Set dst (MulAddVS2VI src1 src2));
9897   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
9898   ins_encode %{
9899     int vector_len = 0;
9900     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9901   %}
9902   ins_pipe( pipe_slow );
9903 %}
9904 
9905 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9906   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9907   match(Set dst (MulAddVS2VI src1 src2));
9908   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
9909   ins_encode %{
9910     int vector_len = 1;
9911     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9912   %}
9913   ins_pipe( pipe_slow );
9914 %}
9915 
9916 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9917   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9918   match(Set dst (MulAddVS2VI src1 src2));
9919   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
9920   ins_encode %{
9921     int vector_len = 2;
9922     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9923   %}
9924   ins_pipe( pipe_slow );
9925 %}
9926 
9927 // --------------------------------- Vector Multiply Add Add ----------------------------------
9928 
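// evpdpwssd (AVX-512 VNNI VPDPWSSD) performs the same paired 16-bit multiply
// and horizontal add as pmaddwd, but also accumulates into the destination,
// which is why the rules below match an AddVI of the MulAddVS2VI result with
// dst.  Illustrative scalar equivalent (placeholder names, not part of the
// matcher):
//
//   for (int i = 0; i < int_lanes; i++)
//     dst[i] += (int)src1_s16[2*i]     * (int)src2_s16[2*i]
//             + (int)src1_s16[2*i + 1] * (int)src2_s16[2*i + 1];
//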
9929 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9930   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
9931   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9932   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
9933   ins_encode %{
9934     int vector_len = 0;
9935     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9936   %}
9937   ins_pipe( pipe_slow );
9938   ins_cost(10);
9939 %}
9940 
9941 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9942   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
9943   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9944   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
9945   ins_encode %{
9946     int vector_len = 0;
9947     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9948   %}
9949   ins_pipe( pipe_slow );
9950   ins_cost(10);
9951 %}
9952 
9953 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9954   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
9955   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9956   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
9957   ins_encode %{
9958     int vector_len = 1;
9959     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9960   %}
9961   ins_pipe( pipe_slow );
9962   ins_cost(10);
9963 %}
9964 
9965 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9966   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
9967   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9968   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
9969   ins_encode %{
9970     int vector_len = 2;
9971     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9972   %}
9973   ins_pipe( pipe_slow );
9974   ins_cost(10);
9975 %}
9976 
9977 // --------------------------------- PopCount --------------------------------------
9978 
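// vpopcntd (AVX-512 VPOPCNTDQ) counts the set bits of each 32-bit lane.
// Illustrative scalar equivalent (placeholder names, not part of the matcher):
//
//   for (int i = 0; i < int_lanes; i++)
//     dst[i] = __builtin_popcount(src[i]);   // per-int population count
//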
9979 instruct vpopcount2I(vecD dst, vecD src) %{
9980   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
9981   match(Set dst (PopCountVI src));
9982   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
9983   ins_encode %{
9984     int vector_len = 0;
9985     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9986   %}
9987   ins_pipe( pipe_slow );
9988 %}
9989 
9990 instruct vpopcount4I(vecX dst, vecX src) %{
9991   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
9992   match(Set dst (PopCountVI src));
9993   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
9994   ins_encode %{
9995     int vector_len = 0;
9996     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9997   %}
9998   ins_pipe( pipe_slow );
9999 %}
10000 
10001 instruct vpopcount8I(vecY dst, vecY src) %{
10002   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
10003   match(Set dst (PopCountVI src));
10004   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
10005   ins_encode %{
10006     int vector_len = 1;
10007     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10008   %}
10009   ins_pipe( pipe_slow );
10010 %}
10011 
10012 instruct vpopcount16I(vecZ dst, vecZ src) %{
10013   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
10014   match(Set dst (PopCountVI src));
10015   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
10016   ins_encode %{
10017     int vector_len = 2;
10018     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10019   %}
10020   ins_pipe( pipe_slow );
10021 %}