1 //
   2 // Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // archtecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 733 
 734 // Class for pre evex double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for evex double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre evex 32bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for evex 32bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for all 64bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for all 64bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for all 128bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for all 128bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p,
                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
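
  // In both cases the pc of this stub ("the_pc") is now on top of the stack
  // before control transfers to the deopt blob's unpack entry.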
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375 
1376 
1377 const bool Matcher::match_rule_supported(int opcode) {
1378   if (!has_match_rule(opcode))
1379     return false;
1380 
1381   bool ret_value = true;
1382   switch (opcode) {
1383     case Op_PopCountI:
1384     case Op_PopCountL:
1385       if (!UsePopCountInstruction)
1386         ret_value = false;
1387       break;
1388     case Op_PopCountVI:
1389       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1390         ret_value = false;
1391       break;
1392     case Op_MulVI:
1393       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1394         ret_value = false;
1395       break;
1396     case Op_MulVL:
1397     case Op_MulReductionVL:
1398       if (VM_Version::supports_avx512dq() == false)
1399         ret_value = false;
1400       break;
1401     case Op_AddReductionVL:
1402       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1403         ret_value = false;
1404       break;
1405     case Op_AddReductionVI:
1406       if (UseSSE < 3) // requires at least SSE3
1407         ret_value = false;
1408       break;
1409     case Op_MulReductionVI:
1410       if (UseSSE < 4) // requires at least SSE4
1411         ret_value = false;
1412       break;
1413     case Op_AddReductionVF:
1414     case Op_AddReductionVD:
1415     case Op_MulReductionVF:
1416     case Op_MulReductionVD:
1417       if (UseSSE < 1) // requires at least SSE
1418         ret_value = false;
1419       break;
1420     case Op_SqrtVD:
1421     case Op_SqrtVF:
1422       if (UseAVX < 1) // enabled for AVX only
1423         ret_value = false;
1424       break;
1425     case Op_CompareAndSwapL:
1426 #ifdef _LP64
1427     case Op_CompareAndSwapP:
1428 #endif
1429       if (!VM_Version::supports_cx8())
1430         ret_value = false;
1431       break;
1432     case Op_CMoveVF:
1433     case Op_CMoveVD:
1434       if (UseAVX < 1 || UseAVX > 2)
1435         ret_value = false;
1436       break;
1437     case Op_StrIndexOf:
1438       if (!UseSSE42Intrinsics)
1439         ret_value = false;
1440       break;
1441     case Op_StrIndexOfChar:
1442       if (!UseSSE42Intrinsics)
1443         ret_value = false;
1444       break;
1445     case Op_OnSpinWait:
1446       if (VM_Version::supports_on_spin_wait() == false)
1447         ret_value = false;
1448       break;
1449     case Op_MulAddVS2VI:
1450       if (UseSSE < 2)
1451         ret_value = false;
1452       break;
1453   }
1454 
  return ret_value;  // By default, match rules are supported.
1456 }
1457 
1458 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, guarded by the vector length (vlen).
1461   bool ret_value = match_rule_supported(opcode);
1462   if (ret_value) {
1463     switch (opcode) {
1464       case Op_AddVB:
1465       case Op_SubVB:
1466         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1467           ret_value = false;
1468         break;
1469       case Op_URShiftVS:
1470       case Op_RShiftVS:
1471       case Op_LShiftVS:
1472       case Op_MulVS:
1473       case Op_AddVS:
1474       case Op_SubVS:
1475         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1476           ret_value = false;
1477         break;
1478       case Op_CMoveVF:
1479         if (vlen != 8)
1480           ret_value  = false;
1481         break;
1482       case Op_CMoveVD:
1483         if (vlen != 4)
1484           ret_value  = false;
1485         break;
1486     }
1487   }
1488 
  return ret_value;  // By default, match rules are supported.
1490 }
1491 
1492 const bool Matcher::has_predicated_vectors(void) {
1493   bool ret_value = false;
1494   if (UseAVX > 2) {
1495     ret_value = VM_Version::supports_avx512vl();
1496   }
1497 
1498   return ret_value;
1499 }
1500 
1501 const int Matcher::float_pressure(int default_pressure_threshold) {
1502   int float_pressure_threshold = default_pressure_threshold;
1503 #ifdef _LP64
1504   if (UseAVX > 2) {
1505     // Increase pressure threshold on machines with AVX3 which have
1506     // 2x more XMM registers.
1507     float_pressure_threshold = default_pressure_threshold * 2;
1508   }
1509 #endif
1510   return float_pressure_threshold;
1511 }
1512 
1513 // Max vector size in bytes. 0 if not supported.
1514 const int Matcher::vector_width_in_bytes(BasicType bt) {
1515   assert(is_java_primitive(bt), "only primitive type vectors");
1516   if (UseSSE < 2) return 0;
1517   // SSE2 supports 128bit vectors for all types.
1518   // AVX2 supports 256bit vectors for all types.
  // AVX-512/EVEX supports 512bit vectors for all types.
1520   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1521   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1522   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1523     size = (UseAVX > 2) ? 64 : 32;
1524   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1525     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1526   // Use flag to limit vector size.
1527   size = MIN2(size,(int)MaxVectorSize);
1528   // Minimum 2 values in vector (or 4 for bytes).
1529   switch (bt) {
1530   case T_DOUBLE:
1531   case T_LONG:
1532     if (size < 16) return 0;
1533     break;
1534   case T_FLOAT:
1535   case T_INT:
1536     if (size < 8) return 0;
1537     break;
1538   case T_BOOLEAN:
1539     if (size < 4) return 0;
1540     break;
1541   case T_CHAR:
1542     if (size < 4) return 0;
1543     break;
1544   case T_BYTE:
1545     if (size < 4) return 0;
1546     break;
1547   case T_SHORT:
1548     if (size < 4) return 0;
1549     break;
1550   default:
1551     ShouldNotReachHere();
1552   }
1553   return size;
1554 }
1555 
1556 // Limits on vector size (number of elements) loaded into vector.
1557 const int Matcher::max_vector_size(const BasicType bt) {
1558   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1559 }
1560 const int Matcher::min_vector_size(const BasicType bt) {
1561   int max_size = max_vector_size(bt);
1562   // Min size which can be loaded into vector is 4 bytes.
1563   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1564   return MIN2(size,max_size);
1565 }
1566 
1567 // Vector ideal reg corresponding to specified size in bytes
1568 const uint Matcher::vector_ideal_reg(int size) {
1569   assert(MaxVectorSize >= size, "");
1570   switch(size) {
1571     case  4: return Op_VecS;
1572     case  8: return Op_VecD;
1573     case 16: return Op_VecX;
1574     case 32: return Op_VecY;
1575     case 64: return Op_VecZ;
1576   }
1577   ShouldNotReachHere();
1578   return 0;
1579 }
1580 
1581 // Only lowest bits of xmm reg are used for vector shift count.
1582 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1583   return Op_VecS;
1584 }
1585 
1586 // x86 supports misaligned vectors store/load.
1587 const bool Matcher::misaligned_vectors_ok() {
1588   return !AlignVector; // can be changed by flag
1589 }
1590 
1591 // x86 AES instructions are compatible with SunJCE expanded
1592 // keys, hence we do not need to pass the original key to stubs
1593 const bool Matcher::pass_original_key_for_aes() {
1594   return false;
1595 }
1596 
1597 
1598 const bool Matcher::convi2l_type_required = true;
1599 
1600 // Check for shift by small constant as well
1601 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1602   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1603       shift->in(2)->get_int() <= 3 &&
1604       // Are there other uses besides address expressions?
1605       !matcher->is_visited(shift)) {
1606     address_visited.set(shift->_idx); // Flag as address_visited
1607     mstack.push(shift->in(2), Matcher::Visit);
1608     Node *conv = shift->in(1);
1609 #ifdef _LP64
    // Allow the Matcher to match the rule that bypasses the
    // ConvI2L operation for an array index on LP64
    // when the index value is known to be non-negative.
1613     if (conv->Opcode() == Op_ConvI2L &&
1614         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1615         // Are there other uses besides address expressions?
1616         !matcher->is_visited(conv)) {
1617       address_visited.set(conv->_idx); // Flag as address_visited
1618       mstack.push(conv->in(1), Matcher::Pre_Visit);
1619     } else
1620 #endif
1621       mstack.push(conv, Matcher::Pre_Visit);
1622     return true;
1623   }
1624   return false;
1625 }
1626 
1627 // Should the Matcher clone shifts on addressing modes, expecting them
1628 // to be subsumed into complex addressing expressions or compute them
1629 // into registers?
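// For example, a typical LP64 array element address of the form
//   AddP(ary, AddP(ary, ary, LShiftL(ConvI2L(index), 2)), ConL(#16))
// gets the shift (and, for a non-negative index, the ConvI2L) cloned below so
// the matcher can fold them into a single [ary + index*4 + 16] addressing
// mode instead of materializing the scaled index in a register.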
1630 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1631   Node *off = m->in(AddPNode::Offset);
1632   if (off->is_Con()) {
1633     address_visited.test_set(m->_idx); // Flag as address_visited
1634     Node *adr = m->in(AddPNode::Address);
1635 
1636     // Intel can handle 2 adds in addressing mode
1637     // AtomicAdd is not an addressing expression.
1638     // Cheap to find it by looking for screwy base.
1639     if (adr->is_AddP() &&
1640         !adr->in(AddPNode::Base)->is_top() &&
1641         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1642         // Are there other uses besides address expressions?
1643         !is_visited(adr)) {
1644       address_visited.set(adr->_idx); // Flag as address_visited
1645       Node *shift = adr->in(AddPNode::Offset);
1646       if (!clone_shift(shift, this, mstack, address_visited)) {
1647         mstack.push(shift, Pre_Visit);
1648       }
1649       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1650       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1651     } else {
1652       mstack.push(adr, Pre_Visit);
1653     }
1654 
1655     // Clone X+offset as it also folds into most addressing expressions
1656     mstack.push(off, Visit);
1657     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1658     return true;
1659   } else if (clone_shift(off, this, mstack, address_visited)) {
1660     address_visited.test_set(m->_idx); // Flag as address_visited
1661     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1662     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1663     return true;
1664   }
1665   return false;
1666 }
1667 
1668 void Compile::reshape_address(AddPNode* addp) {
1669 }
1670 
1671 // Helper methods for MachSpillCopyNode::implementation().
1672 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1673                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM the size calculation is very complex, so instructions are
  // emitted into a scratch buffer to determine their size.
1676   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1677   assert(ireg == Op_VecS || // 32bit vector
1678          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1679          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1680          "no non-adjacent vector moves" );
1681   if (cbuf) {
1682     MacroAssembler _masm(cbuf);
1683     int offset = __ offset();
1684     switch (ireg) {
1685     case Op_VecS: // copy whole register
1686     case Op_VecD:
1687     case Op_VecX:
#ifndef _LP64
      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
#else
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
        __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
1697 #endif
1698       break;
1699     case Op_VecY:
#ifndef _LP64
      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
#else
      if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
      } else {
        __ vpxor(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), 2);
        __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
1709 #endif
1710       break;
1711     case Op_VecZ:
1712       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1713       break;
1714     default:
1715       ShouldNotReachHere();
1716     }
1717     int size = __ offset() - offset;
1718 #ifdef ASSERT
1719     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1721 #endif
1722     return size;
1723 #ifndef PRODUCT
1724   } else if (!do_size) {
1725     switch (ireg) {
1726     case Op_VecS:
1727     case Op_VecD:
1728     case Op_VecX:
1729       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1730       break;
1731     case Op_VecY:
1732     case Op_VecZ:
1733       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1734       break;
1735     default:
1736       ShouldNotReachHere();
1737     }
1738 #endif
1739   }
1740   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
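  // A VEX-encoded reg-reg move is therefore 4 bytes (2-byte prefix + opcode + ModRM);
  // the EVEX form used when UseAVX > 2 carries a 4-byte prefix, giving 6 bytes.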
1741   return (UseAVX > 2) ? 6 : 4;
1742 }
1743 
1744 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1745                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM the size calculation is very complex, so instructions are
  // emitted into a scratch buffer to determine their size.
1748   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1749   if (cbuf) {
1750     MacroAssembler _masm(cbuf);
1751     int offset = __ offset();
1752     if (is_load) {
1753       switch (ireg) {
1754       case Op_VecS:
1755         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1756         break;
1757       case Op_VecD:
1758         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1759         break;
1760       case Op_VecX:
#ifndef _LP64
        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
#else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        } else {
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinserti32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
        }
1770 #endif
1771         break;
1772       case Op_VecY:
#ifndef _LP64
        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
#else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
        } else {
          __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinserti64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
        }
1782 #endif
1783         break;
1784       case Op_VecZ:
1785         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1786         break;
1787       default:
1788         ShouldNotReachHere();
1789       }
1790     } else { // store
1791       switch (ireg) {
1792       case Op_VecS:
1793         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1794         break;
1795       case Op_VecD:
1796         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1797         break;
1798       case Op_VecX:
#ifndef _LP64
1800         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1801 #else
1802         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1803           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1804         }
1805         else {
1806           __ vextracti32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1807         }
1808 #endif
1809         break;
1810       case Op_VecY:
#ifndef _LP64
1812         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1813 #else
1814         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1815           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1816         }
1817         else {
1818           __ vextracti64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1819         }
1820 #endif
1821         break;
1822       case Op_VecZ:
1823         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1824         break;
1825       default:
1826         ShouldNotReachHere();
1827       }
1828     }
1829     int size = __ offset() - offset;
1830 #ifdef ASSERT
1831     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1832     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1834 #endif
1835     return size;
1836 #ifndef PRODUCT
1837   } else if (!do_size) {
1838     if (is_load) {
1839       switch (ireg) {
1840       case Op_VecS:
1841         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1842         break;
1843       case Op_VecD:
1844         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1845         break;
1846        case Op_VecX:
1847         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1848         break;
1849       case Op_VecY:
1850       case Op_VecZ:
1851         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1852         break;
1853       default:
1854         ShouldNotReachHere();
1855       }
1856     } else { // store
1857       switch (ireg) {
1858       case Op_VecS:
1859         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1860         break;
1861       case Op_VecD:
1862         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1863         break;
1864        case Op_VecX:
1865         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1866         break;
1867       case Op_VecY:
1868       case Op_VecZ:
1869         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1870         break;
1871       default:
1872         ShouldNotReachHere();
1873       }
1874     }
1875 #endif
1876   }
1877   bool is_single_byte = false;
1878   int vec_len = 0;
1879   if ((UseAVX > 2) && (stack_offset != 0)) {
1880     int tuple_type = Assembler::EVEX_FVM;
1881     int input_size = Assembler::EVEX_32bit;
1882     switch (ireg) {
1883     case Op_VecS:
1884       tuple_type = Assembler::EVEX_T1S;
1885       break;
1886     case Op_VecD:
1887       tuple_type = Assembler::EVEX_T1S;
1888       input_size = Assembler::EVEX_64bit;
1889       break;
1890     case Op_VecX:
1891       break;
1892     case Op_VecY:
1893       vec_len = 1;
1894       break;
1895     case Op_VecZ:
1896       vec_len = 2;
1897       break;
1898     }
1899     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1900   }
1901   int offset_size = 0;
1902   int size = 5;
1903   if (UseAVX > 2 ) {
1904     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1905       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1906       size += 2; // Need an additional two bytes for EVEX encoding
1907     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1908       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1909     } else {
1910       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1912     }
1913   } else {
1914     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1915   }
1916   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
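  // The base size of 5 covers the 2-byte VEX prefix, opcode, ModRM and SIB byte
  // (rsp-relative addressing always needs a SIB); EVEX adds 2 more prefix bytes,
  // and offset_size accounts for a disp8 or disp32 displacement.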
1917   return size+offset_size;
1918 }
1919 
1920 static inline jint replicate4_imm(int con, int width) {
1921   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
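  // For example, replicate4_imm(0x1F, 1) yields 0x1F1F1F1F and
  // replicate4_imm(0xABCD, 2) yields 0xABCDABCD.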
1922   assert(width == 1 || width == 2, "only byte or short types here");
1923   int bit_width = width * 8;
1924   jint val = con;
1925   val &= (1 << bit_width) - 1;  // mask off sign bits
1926   while(bit_width < 32) {
1927     val |= (val << bit_width);
1928     bit_width <<= 1;
1929   }
1930   return val;
1931 }
1932 
1933 static inline jlong replicate8_imm(int con, int width) {
1934   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
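  // For example, replicate8_imm(0x7F, 1) yields 0x7F7F7F7F7F7F7F7F and
  // replicate8_imm(0x12345678, 4) yields 0x1234567812345678.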
1935   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1936   int bit_width = width * 8;
1937   jlong val = con;
1938   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1939   while(bit_width < 64) {
1940     val |= (val << bit_width);
1941     bit_width <<= 1;
1942   }
1943   return val;
1944 }
1945 
1946 #ifndef PRODUCT
1947   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1948     st->print("nop \t# %d bytes pad for loops and calls", _count);
1949   }
1950 #endif
1951 
1952   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1953     MacroAssembler _masm(&cbuf);
1954     __ nop(_count);
1955   }
1956 
1957   uint MachNopNode::size(PhaseRegAlloc*) const {
1958     return _count;
1959   }
1960 
1961 #ifndef PRODUCT
1962   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1963     st->print("# breakpoint");
1964   }
1965 #endif
1966 
1967   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1968     MacroAssembler _masm(&cbuf);
1969     __ int3();
1970   }
1971 
1972   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1973     return MachNode::size(ra_);
1974   }
1975 
1976 %}
1977 
1978 encode %{
1979 
1980   enc_class call_epilog %{
1981     if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find the magic cookie on the stack
1983       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1984       MacroAssembler _masm(&cbuf);
1985       Label L;
1986       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1987       __ jccb(Assembler::equal, L);
1988       // Die if stack mismatch
1989       __ int3();
1990       __ bind(L);
1991     }
1992   %}
1993 
1994 %}
1995 
1996 
1997 //----------OPERANDS-----------------------------------------------------------
1998 // Operand definitions must precede instruction definitions for correct parsing
1999 // in the ADLC because operands constitute user defined types which are used in
2000 // instruction definitions.
2001 
2002 operand vecZ() %{
2003   constraint(ALLOC_IN_RC(vectorz_reg));
2004   match(VecZ);
2005 
2006   format %{ %}
2007   interface(REG_INTER);
2008 %}
2009 
2010 operand legVecZ() %{
2011   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2012   match(VecZ);
2013 
2014   format %{ %}
2015   interface(REG_INTER);
2016 %}
2017 
2018 // Comparison Code for FP conditional move
2019 operand cmpOp_vcmppd() %{
2020   match(Bool);
2021 
2022   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2023             n->as_Bool()->_test._test != BoolTest::no_overflow);
2024   format %{ "" %}
2025   interface(COND_INTER) %{
2026     equal        (0x0, "eq");
2027     less         (0x1, "lt");
2028     less_equal   (0x2, "le");
2029     not_equal    (0xC, "ne");
2030     greater_equal(0xD, "ge");
2031     greater      (0xE, "gt");
    // TODO: adlc cannot compile this operand without the next two lines; it fails with:
    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
    // equal' for overflow.
2035     overflow     (0x20, "o");  // not really supported by the instruction
2036     no_overflow  (0x21, "no"); // not really supported by the instruction
2037   %}
2038 %}
2039 
2040 
2041 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2042 
2043 // ============================================================================
2044 
2045 instruct ShouldNotReachHere() %{
2046   match(Halt);
2047   format %{ "ud2\t# ShouldNotReachHere" %}
2048   ins_encode %{
2049     __ ud2();
2050   %}
2051   ins_pipe(pipe_slow);
2052 %}
2053 
2054 // =================================EVEX special===============================
2055 
2056 instruct setMask(rRegI dst, rRegI src) %{
2057   predicate(Matcher::has_predicated_vectors());
2058   match(Set dst (SetVectMaskI  src));
2059   effect(TEMP dst);
2060   format %{ "setvectmask   $dst, $src" %}
2061   ins_encode %{
2062     __ setvectmask($dst$$Register, $src$$Register);
2063   %}
2064   ins_pipe(pipe_slow);
2065 %}
2066 
2067 // ============================================================================
2068 
2069 instruct addF_reg(regF dst, regF src) %{
2070   predicate((UseSSE>=1) && (UseAVX == 0));
2071   match(Set dst (AddF dst src));
2072 
2073   format %{ "addss   $dst, $src" %}
2074   ins_cost(150);
2075   ins_encode %{
2076     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2077   %}
2078   ins_pipe(pipe_slow);
2079 %}
2080 
2081 instruct addF_mem(regF dst, memory src) %{
2082   predicate((UseSSE>=1) && (UseAVX == 0));
2083   match(Set dst (AddF dst (LoadF src)));
2084 
2085   format %{ "addss   $dst, $src" %}
2086   ins_cost(150);
2087   ins_encode %{
2088     __ addss($dst$$XMMRegister, $src$$Address);
2089   %}
2090   ins_pipe(pipe_slow);
2091 %}
2092 
2093 instruct addF_imm(regF dst, immF con) %{
2094   predicate((UseSSE>=1) && (UseAVX == 0));
2095   match(Set dst (AddF dst con));
2096   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2097   ins_cost(150);
2098   ins_encode %{
2099     __ addss($dst$$XMMRegister, $constantaddress($con));
2100   %}
2101   ins_pipe(pipe_slow);
2102 %}
2103 
2104 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2105   predicate(UseAVX > 0);
2106   match(Set dst (AddF src1 src2));
2107 
2108   format %{ "vaddss  $dst, $src1, $src2" %}
2109   ins_cost(150);
2110   ins_encode %{
2111     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2112   %}
2113   ins_pipe(pipe_slow);
2114 %}
2115 
2116 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2117   predicate(UseAVX > 0);
2118   match(Set dst (AddF src1 (LoadF src2)));
2119 
2120   format %{ "vaddss  $dst, $src1, $src2" %}
2121   ins_cost(150);
2122   ins_encode %{
2123     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2124   %}
2125   ins_pipe(pipe_slow);
2126 %}
2127 
2128 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2129   predicate(UseAVX > 0);
2130   match(Set dst (AddF src con));
2131 
2132   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2133   ins_cost(150);
2134   ins_encode %{
2135     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2136   %}
2137   ins_pipe(pipe_slow);
2138 %}
2139 
2140 instruct addD_reg(regD dst, regD src) %{
2141   predicate((UseSSE>=2) && (UseAVX == 0));
2142   match(Set dst (AddD dst src));
2143 
2144   format %{ "addsd   $dst, $src" %}
2145   ins_cost(150);
2146   ins_encode %{
2147     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2148   %}
2149   ins_pipe(pipe_slow);
2150 %}
2151 
2152 instruct addD_mem(regD dst, memory src) %{
2153   predicate((UseSSE>=2) && (UseAVX == 0));
2154   match(Set dst (AddD dst (LoadD src)));
2155 
2156   format %{ "addsd   $dst, $src" %}
2157   ins_cost(150);
2158   ins_encode %{
2159     __ addsd($dst$$XMMRegister, $src$$Address);
2160   %}
2161   ins_pipe(pipe_slow);
2162 %}
2163 
2164 instruct addD_imm(regD dst, immD con) %{
2165   predicate((UseSSE>=2) && (UseAVX == 0));
2166   match(Set dst (AddD dst con));
2167   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2168   ins_cost(150);
2169   ins_encode %{
2170     __ addsd($dst$$XMMRegister, $constantaddress($con));
2171   %}
2172   ins_pipe(pipe_slow);
2173 %}
2174 
2175 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2176   predicate(UseAVX > 0);
2177   match(Set dst (AddD src1 src2));
2178 
2179   format %{ "vaddsd  $dst, $src1, $src2" %}
2180   ins_cost(150);
2181   ins_encode %{
2182     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2183   %}
2184   ins_pipe(pipe_slow);
2185 %}
2186 
2187 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2188   predicate(UseAVX > 0);
2189   match(Set dst (AddD src1 (LoadD src2)));
2190 
2191   format %{ "vaddsd  $dst, $src1, $src2" %}
2192   ins_cost(150);
2193   ins_encode %{
2194     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2195   %}
2196   ins_pipe(pipe_slow);
2197 %}
2198 
2199 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2200   predicate(UseAVX > 0);
2201   match(Set dst (AddD src con));
2202 
2203   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2204   ins_cost(150);
2205   ins_encode %{
2206     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2207   %}
2208   ins_pipe(pipe_slow);
2209 %}
2210 
2211 instruct subF_reg(regF dst, regF src) %{
2212   predicate((UseSSE>=1) && (UseAVX == 0));
2213   match(Set dst (SubF dst src));
2214 
2215   format %{ "subss   $dst, $src" %}
2216   ins_cost(150);
2217   ins_encode %{
2218     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2219   %}
2220   ins_pipe(pipe_slow);
2221 %}
2222 
2223 instruct subF_mem(regF dst, memory src) %{
2224   predicate((UseSSE>=1) && (UseAVX == 0));
2225   match(Set dst (SubF dst (LoadF src)));
2226 
2227   format %{ "subss   $dst, $src" %}
2228   ins_cost(150);
2229   ins_encode %{
2230     __ subss($dst$$XMMRegister, $src$$Address);
2231   %}
2232   ins_pipe(pipe_slow);
2233 %}
2234 
2235 instruct subF_imm(regF dst, immF con) %{
2236   predicate((UseSSE>=1) && (UseAVX == 0));
2237   match(Set dst (SubF dst con));
2238   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2239   ins_cost(150);
2240   ins_encode %{
2241     __ subss($dst$$XMMRegister, $constantaddress($con));
2242   %}
2243   ins_pipe(pipe_slow);
2244 %}
2245 
2246 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2247   predicate(UseAVX > 0);
2248   match(Set dst (SubF src1 src2));
2249 
2250   format %{ "vsubss  $dst, $src1, $src2" %}
2251   ins_cost(150);
2252   ins_encode %{
2253     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2254   %}
2255   ins_pipe(pipe_slow);
2256 %}
2257 
2258 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2259   predicate(UseAVX > 0);
2260   match(Set dst (SubF src1 (LoadF src2)));
2261 
2262   format %{ "vsubss  $dst, $src1, $src2" %}
2263   ins_cost(150);
2264   ins_encode %{
2265     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2266   %}
2267   ins_pipe(pipe_slow);
2268 %}
2269 
2270 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2271   predicate(UseAVX > 0);
2272   match(Set dst (SubF src con));
2273 
2274   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2275   ins_cost(150);
2276   ins_encode %{
2277     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2278   %}
2279   ins_pipe(pipe_slow);
2280 %}
2281 
2282 instruct subD_reg(regD dst, regD src) %{
2283   predicate((UseSSE>=2) && (UseAVX == 0));
2284   match(Set dst (SubD dst src));
2285 
2286   format %{ "subsd   $dst, $src" %}
2287   ins_cost(150);
2288   ins_encode %{
2289     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2290   %}
2291   ins_pipe(pipe_slow);
2292 %}
2293 
2294 instruct subD_mem(regD dst, memory src) %{
2295   predicate((UseSSE>=2) && (UseAVX == 0));
2296   match(Set dst (SubD dst (LoadD src)));
2297 
2298   format %{ "subsd   $dst, $src" %}
2299   ins_cost(150);
2300   ins_encode %{
2301     __ subsd($dst$$XMMRegister, $src$$Address);
2302   %}
2303   ins_pipe(pipe_slow);
2304 %}
2305 
2306 instruct subD_imm(regD dst, immD con) %{
2307   predicate((UseSSE>=2) && (UseAVX == 0));
2308   match(Set dst (SubD dst con));
2309   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2310   ins_cost(150);
2311   ins_encode %{
2312     __ subsd($dst$$XMMRegister, $constantaddress($con));
2313   %}
2314   ins_pipe(pipe_slow);
2315 %}
2316 
2317 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2318   predicate(UseAVX > 0);
2319   match(Set dst (SubD src1 src2));
2320 
2321   format %{ "vsubsd  $dst, $src1, $src2" %}
2322   ins_cost(150);
2323   ins_encode %{
2324     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2325   %}
2326   ins_pipe(pipe_slow);
2327 %}
2328 
2329 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2330   predicate(UseAVX > 0);
2331   match(Set dst (SubD src1 (LoadD src2)));
2332 
2333   format %{ "vsubsd  $dst, $src1, $src2" %}
2334   ins_cost(150);
2335   ins_encode %{
2336     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2337   %}
2338   ins_pipe(pipe_slow);
2339 %}
2340 
2341 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2342   predicate(UseAVX > 0);
2343   match(Set dst (SubD src con));
2344 
2345   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2346   ins_cost(150);
2347   ins_encode %{
2348     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2349   %}
2350   ins_pipe(pipe_slow);
2351 %}
2352 
2353 instruct mulF_reg(regF dst, regF src) %{
2354   predicate((UseSSE>=1) && (UseAVX == 0));
2355   match(Set dst (MulF dst src));
2356 
2357   format %{ "mulss   $dst, $src" %}
2358   ins_cost(150);
2359   ins_encode %{
2360     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2361   %}
2362   ins_pipe(pipe_slow);
2363 %}
2364 
2365 instruct mulF_mem(regF dst, memory src) %{
2366   predicate((UseSSE>=1) && (UseAVX == 0));
2367   match(Set dst (MulF dst (LoadF src)));
2368 
2369   format %{ "mulss   $dst, $src" %}
2370   ins_cost(150);
2371   ins_encode %{
2372     __ mulss($dst$$XMMRegister, $src$$Address);
2373   %}
2374   ins_pipe(pipe_slow);
2375 %}
2376 
2377 instruct mulF_imm(regF dst, immF con) %{
2378   predicate((UseSSE>=1) && (UseAVX == 0));
2379   match(Set dst (MulF dst con));
2380   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2381   ins_cost(150);
2382   ins_encode %{
2383     __ mulss($dst$$XMMRegister, $constantaddress($con));
2384   %}
2385   ins_pipe(pipe_slow);
2386 %}
2387 
2388 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2389   predicate(UseAVX > 0);
2390   match(Set dst (MulF src1 src2));
2391 
2392   format %{ "vmulss  $dst, $src1, $src2" %}
2393   ins_cost(150);
2394   ins_encode %{
2395     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2396   %}
2397   ins_pipe(pipe_slow);
2398 %}
2399 
2400 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2401   predicate(UseAVX > 0);
2402   match(Set dst (MulF src1 (LoadF src2)));
2403 
2404   format %{ "vmulss  $dst, $src1, $src2" %}
2405   ins_cost(150);
2406   ins_encode %{
2407     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2408   %}
2409   ins_pipe(pipe_slow);
2410 %}
2411 
2412 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2413   predicate(UseAVX > 0);
2414   match(Set dst (MulF src con));
2415 
2416   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2417   ins_cost(150);
2418   ins_encode %{
2419     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2420   %}
2421   ins_pipe(pipe_slow);
2422 %}
2423 
2424 instruct mulD_reg(regD dst, regD src) %{
2425   predicate((UseSSE>=2) && (UseAVX == 0));
2426   match(Set dst (MulD dst src));
2427 
2428   format %{ "mulsd   $dst, $src" %}
2429   ins_cost(150);
2430   ins_encode %{
2431     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2432   %}
2433   ins_pipe(pipe_slow);
2434 %}
2435 
2436 instruct mulD_mem(regD dst, memory src) %{
2437   predicate((UseSSE>=2) && (UseAVX == 0));
2438   match(Set dst (MulD dst (LoadD src)));
2439 
2440   format %{ "mulsd   $dst, $src" %}
2441   ins_cost(150);
2442   ins_encode %{
2443     __ mulsd($dst$$XMMRegister, $src$$Address);
2444   %}
2445   ins_pipe(pipe_slow);
2446 %}
2447 
2448 instruct mulD_imm(regD dst, immD con) %{
2449   predicate((UseSSE>=2) && (UseAVX == 0));
2450   match(Set dst (MulD dst con));
2451   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2452   ins_cost(150);
2453   ins_encode %{
2454     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2455   %}
2456   ins_pipe(pipe_slow);
2457 %}
2458 
2459 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2460   predicate(UseAVX > 0);
2461   match(Set dst (MulD src1 src2));
2462 
2463   format %{ "vmulsd  $dst, $src1, $src2" %}
2464   ins_cost(150);
2465   ins_encode %{
2466     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2467   %}
2468   ins_pipe(pipe_slow);
2469 %}
2470 
2471 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2472   predicate(UseAVX > 0);
2473   match(Set dst (MulD src1 (LoadD src2)));
2474 
2475   format %{ "vmulsd  $dst, $src1, $src2" %}
2476   ins_cost(150);
2477   ins_encode %{
2478     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2479   %}
2480   ins_pipe(pipe_slow);
2481 %}
2482 
2483 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2484   predicate(UseAVX > 0);
2485   match(Set dst (MulD src con));
2486 
2487   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2488   ins_cost(150);
2489   ins_encode %{
2490     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2491   %}
2492   ins_pipe(pipe_slow);
2493 %}
2494 
2495 instruct divF_reg(regF dst, regF src) %{
2496   predicate((UseSSE>=1) && (UseAVX == 0));
2497   match(Set dst (DivF dst src));
2498 
2499   format %{ "divss   $dst, $src" %}
2500   ins_cost(150);
2501   ins_encode %{
2502     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2503   %}
2504   ins_pipe(pipe_slow);
2505 %}
2506 
2507 instruct divF_mem(regF dst, memory src) %{
2508   predicate((UseSSE>=1) && (UseAVX == 0));
2509   match(Set dst (DivF dst (LoadF src)));
2510 
2511   format %{ "divss   $dst, $src" %}
2512   ins_cost(150);
2513   ins_encode %{
2514     __ divss($dst$$XMMRegister, $src$$Address);
2515   %}
2516   ins_pipe(pipe_slow);
2517 %}
2518 
2519 instruct divF_imm(regF dst, immF con) %{
2520   predicate((UseSSE>=1) && (UseAVX == 0));
2521   match(Set dst (DivF dst con));
2522   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2523   ins_cost(150);
2524   ins_encode %{
2525     __ divss($dst$$XMMRegister, $constantaddress($con));
2526   %}
2527   ins_pipe(pipe_slow);
2528 %}
2529 
2530 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2531   predicate(UseAVX > 0);
2532   match(Set dst (DivF src1 src2));
2533 
2534   format %{ "vdivss  $dst, $src1, $src2" %}
2535   ins_cost(150);
2536   ins_encode %{
2537     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2538   %}
2539   ins_pipe(pipe_slow);
2540 %}
2541 
2542 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2543   predicate(UseAVX > 0);
2544   match(Set dst (DivF src1 (LoadF src2)));
2545 
2546   format %{ "vdivss  $dst, $src1, $src2" %}
2547   ins_cost(150);
2548   ins_encode %{
2549     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2550   %}
2551   ins_pipe(pipe_slow);
2552 %}
2553 
2554 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2555   predicate(UseAVX > 0);
2556   match(Set dst (DivF src con));
2557 
2558   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2559   ins_cost(150);
2560   ins_encode %{
2561     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2562   %}
2563   ins_pipe(pipe_slow);
2564 %}
2565 
2566 instruct divD_reg(regD dst, regD src) %{
2567   predicate((UseSSE>=2) && (UseAVX == 0));
2568   match(Set dst (DivD dst src));
2569 
2570   format %{ "divsd   $dst, $src" %}
2571   ins_cost(150);
2572   ins_encode %{
2573     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2574   %}
2575   ins_pipe(pipe_slow);
2576 %}
2577 
2578 instruct divD_mem(regD dst, memory src) %{
2579   predicate((UseSSE>=2) && (UseAVX == 0));
2580   match(Set dst (DivD dst (LoadD src)));
2581 
2582   format %{ "divsd   $dst, $src" %}
2583   ins_cost(150);
2584   ins_encode %{
2585     __ divsd($dst$$XMMRegister, $src$$Address);
2586   %}
2587   ins_pipe(pipe_slow);
2588 %}
2589 
2590 instruct divD_imm(regD dst, immD con) %{
2591   predicate((UseSSE>=2) && (UseAVX == 0));
2592   match(Set dst (DivD dst con));
2593   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2594   ins_cost(150);
2595   ins_encode %{
2596     __ divsd($dst$$XMMRegister, $constantaddress($con));
2597   %}
2598   ins_pipe(pipe_slow);
2599 %}
2600 
2601 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2602   predicate(UseAVX > 0);
2603   match(Set dst (DivD src1 src2));
2604 
2605   format %{ "vdivsd  $dst, $src1, $src2" %}
2606   ins_cost(150);
2607   ins_encode %{
2608     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2609   %}
2610   ins_pipe(pipe_slow);
2611 %}
2612 
2613 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2614   predicate(UseAVX > 0);
2615   match(Set dst (DivD src1 (LoadD src2)));
2616 
2617   format %{ "vdivsd  $dst, $src1, $src2" %}
2618   ins_cost(150);
2619   ins_encode %{
2620     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2621   %}
2622   ins_pipe(pipe_slow);
2623 %}
2624 
2625 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2626   predicate(UseAVX > 0);
2627   match(Set dst (DivD src con));
2628 
2629   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2630   ins_cost(150);
2631   ins_encode %{
2632     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2633   %}
2634   ins_pipe(pipe_slow);
2635 %}
2636 
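// Abs/Neg of float and double are implemented by masking or flipping the sign bit
// with a constant (float_signmask/float_signflip and the double equivalents)
// referenced through an ExternalAddress.
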
2637 instruct absF_reg(regF dst) %{
2638   predicate((UseSSE>=1) && (UseAVX == 0));
2639   match(Set dst (AbsF dst));
2640   ins_cost(150);
2641   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2642   ins_encode %{
2643     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2644   %}
2645   ins_pipe(pipe_slow);
2646 %}
2647 
2648 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2649   predicate(UseAVX > 0);
2650   match(Set dst (AbsF src));
2651   ins_cost(150);
2652   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2653   ins_encode %{
2654     int vector_len = 0;
2655     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2656               ExternalAddress(float_signmask()), vector_len);
2657   %}
2658   ins_pipe(pipe_slow);
2659 %}
2660 
2661 instruct absD_reg(regD dst) %{
2662   predicate((UseSSE>=2) && (UseAVX == 0));
2663   match(Set dst (AbsD dst));
2664   ins_cost(150);
2665   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2666             "# abs double by sign masking" %}
2667   ins_encode %{
2668     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2669   %}
2670   ins_pipe(pipe_slow);
2671 %}
2672 
2673 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2674   predicate(UseAVX > 0);
2675   match(Set dst (AbsD src));
2676   ins_cost(150);
2677   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2678             "# abs double by sign masking" %}
2679   ins_encode %{
2680     int vector_len = 0;
2681     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2682               ExternalAddress(double_signmask()), vector_len);
2683   %}
2684   ins_pipe(pipe_slow);
2685 %}
2686 
2687 instruct negF_reg(regF dst) %{
2688   predicate((UseSSE>=1) && (UseAVX == 0));
2689   match(Set dst (NegF dst));
2690   ins_cost(150);
2691   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2692   ins_encode %{
2693     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2694   %}
2695   ins_pipe(pipe_slow);
2696 %}
2697 
2698 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2699   predicate(UseAVX > 0);
2700   match(Set dst (NegF src));
2701   ins_cost(150);
2702   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2703   ins_encode %{
2704     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2705                  ExternalAddress(float_signflip()));
2706   %}
2707   ins_pipe(pipe_slow);
2708 %}
2709 
2710 instruct negD_reg(regD dst) %{
2711   predicate((UseSSE>=2) && (UseAVX == 0));
2712   match(Set dst (NegD dst));
2713   ins_cost(150);
2714   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2715             "# neg double by sign flipping" %}
2716   ins_encode %{
2717     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2718   %}
2719   ins_pipe(pipe_slow);
2720 %}
2721 
2722 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2723   predicate(UseAVX > 0);
2724   match(Set dst (NegD src));
2725   ins_cost(150);
2726   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2727             "# neg double by sign flipping" %}
2728   ins_encode %{
2729     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2730                  ExternalAddress(double_signflip()));
2731   %}
2732   ins_pipe(pipe_slow);
2733 %}
2734 
2735 instruct sqrtF_reg(regF dst, regF src) %{
2736   predicate(UseSSE>=1);
2737   match(Set dst (SqrtF src));
2738 
2739   format %{ "sqrtss  $dst, $src" %}
2740   ins_cost(150);
2741   ins_encode %{
2742     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2743   %}
2744   ins_pipe(pipe_slow);
2745 %}
2746 
2747 instruct sqrtF_mem(regF dst, memory src) %{
2748   predicate(UseSSE>=1);
2749   match(Set dst (SqrtF (LoadF src)));
2750 
2751   format %{ "sqrtss  $dst, $src" %}
2752   ins_cost(150);
2753   ins_encode %{
2754     __ sqrtss($dst$$XMMRegister, $src$$Address);
2755   %}
2756   ins_pipe(pipe_slow);
2757 %}
2758 
2759 instruct sqrtF_imm(regF dst, immF con) %{
2760   predicate(UseSSE>=1);
2761   match(Set dst (SqrtF con));
2762 
2763   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2764   ins_cost(150);
2765   ins_encode %{
2766     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2767   %}
2768   ins_pipe(pipe_slow);
2769 %}
2770 
2771 instruct sqrtD_reg(regD dst, regD src) %{
2772   predicate(UseSSE>=2);
2773   match(Set dst (SqrtD src));
2774 
2775   format %{ "sqrtsd  $dst, $src" %}
2776   ins_cost(150);
2777   ins_encode %{
2778     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2779   %}
2780   ins_pipe(pipe_slow);
2781 %}
2782 
2783 instruct sqrtD_mem(regD dst, memory src) %{
2784   predicate(UseSSE>=2);
2785   match(Set dst (SqrtD (LoadD src)));
2786 
2787   format %{ "sqrtsd  $dst, $src" %}
2788   ins_cost(150);
2789   ins_encode %{
2790     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2791   %}
2792   ins_pipe(pipe_slow);
2793 %}
2794 
2795 instruct sqrtD_imm(regD dst, immD con) %{
2796   predicate(UseSSE>=2);
2797   match(Set dst (SqrtD con));
2798   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2799   ins_cost(150);
2800   ins_encode %{
2801     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2802   %}
2803   ins_pipe(pipe_slow);
2804 %}
2805 
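// Thread.onSpinWait() intrinsic: emit the 'pause' spin-loop hint.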
2806 instruct onspinwait() %{
2807   match(OnSpinWait);
2808   ins_cost(200);
2809 
2810   format %{
2811     $$template
2812     $$emit$$"pause\t! membar_onspinwait"
2813   %}
2814   ins_encode %{
2815     __ pause();
2816   %}
2817   ins_pipe(pipe_slow);
2818 %}
2819 
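// Fused multiply-add. These rules match only when UseFMA is enabled; the
// MacroAssembler fmad/fmaf helpers emit an FMA3 form (presumably
// vfmadd231sd/vfmadd231ss), so a*b+c is computed with a single rounding,
// as Math.fma requires.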
2820 // a * b + c
2821 instruct fmaD_reg(regD a, regD b, regD c) %{
2822   predicate(UseFMA);
2823   match(Set c (FmaD  c (Binary a b)));
2824   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2825   ins_cost(150);
2826   ins_encode %{
2827     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2828   %}
2829   ins_pipe( pipe_slow );
2830 %}
2831 
2832 // a * b + c
2833 instruct fmaF_reg(regF a, regF b, regF c) %{
2834   predicate(UseFMA);
2835   match(Set c (FmaF  c (Binary a b)));
2836   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2837   ins_cost(150);
2838   ins_encode %{
2839     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2840   %}
2841   ins_pipe( pipe_slow );
2842 %}
2843 
2844 // ====================VECTOR INSTRUCTIONS=====================================
2845 
2846 
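// Vector operand sizes: vecS = 4 bytes, vecD = 8, vecX = 16, vecY = 32 and
// vecZ = 64 bytes. The legVec* flavors are assumed to be restricted to the
// legacy XMM0-XMM15 range, for instructions that have no AVX512VL encoding.
// In the encode blocks, vector_len 0/1/2 selects the 128/256/512-bit form.
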
2847 // Load vectors (4 bytes long)
2848 instruct loadV4(vecS dst, memory mem) %{
2849   predicate(n->as_LoadVector()->memory_size() == 4);
2850   match(Set dst (LoadVector mem));
2851   ins_cost(125);
2852   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2853   ins_encode %{
2854     __ movdl($dst$$XMMRegister, $mem$$Address);
2855   %}
2856   ins_pipe( pipe_slow );
2857 %}
2858 
2859 // Move vectors (4 bytes long)
2860 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2861   match(Set dst src);
2862   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
2863   ins_encode %{
2864     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2865   %}
2866   ins_pipe( fpu_reg_reg );
2867 %}
2868 
2869 // Move vectors (4 bytes long)
2870 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2871   match(Set dst src);
2872   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
2873   ins_encode %{
2874     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2875   %}
2876   ins_pipe( fpu_reg_reg );
2877 %}
2878 
2879 // Load vectors (8 bytes long)
2880 instruct loadV8(vecD dst, memory mem) %{
2881   predicate(n->as_LoadVector()->memory_size() == 8);
2882   match(Set dst (LoadVector mem));
2883   ins_cost(125);
2884   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2885   ins_encode %{
2886     __ movq($dst$$XMMRegister, $mem$$Address);
2887   %}
2888   ins_pipe( pipe_slow );
2889 %}
2890 
2891 // Move vectors (8 bytes long)
2892 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
2893   match(Set dst src);
2894   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
2895   ins_encode %{
2896     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2897   %}
2898   ins_pipe( fpu_reg_reg );
2899 %}
2900 
2901 // Move vectors (8 bytes long)
2902 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
2903   match(Set dst src);
2904   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
2905   ins_encode %{
2906     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2907   %}
2908   ins_pipe( fpu_reg_reg );
2909 %}
2910 
2911 // Load vectors (16 bytes long)
2912 instruct loadV16(vecX dst, memory mem) %{
2913   predicate(n->as_LoadVector()->memory_size() == 16);
2914   match(Set dst (LoadVector mem));
2915   ins_cost(125);
2916   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2917   ins_encode %{
2918     __ movdqu($dst$$XMMRegister, $mem$$Address);
2919   %}
2920   ins_pipe( pipe_slow );
2921 %}
2922 
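// Moves between the generic and the legacy (leg*) vector register classes.
// When AVX-512 is present without the VL extension (UseAVX > 2 &&
// !supports_avx512vl()), the 16/32-byte moves below fall back to a full
// 512-bit evmovdquq, since VEX-encoded 128/256-bit moves cannot address
// XMM16-XMM31.
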
2923 // Move vectors (16 bytes long)
2924 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
2925   match(Set dst src);
2926   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
2927   ins_encode %{
2928     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2929       int vector_len = 2;
2930       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2931     } else {
2932       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2933     }
2934   %}
2935   ins_pipe( fpu_reg_reg );
2936 %}
2937 
2938 // Move vectors (16 bytes long)
2939 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
2940   match(Set dst src);
2941   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
2942   ins_encode %{
2943     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2944       int vector_len = 2;
2945       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2946     } else {
2947       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2948     }
2949   %}
2950   ins_pipe( fpu_reg_reg );
2951 %}
2952 
2953 // Load vectors (32 bytes long)
2954 instruct loadV32(vecY dst, memory mem) %{
2955   predicate(n->as_LoadVector()->memory_size() == 32);
2956   match(Set dst (LoadVector mem));
2957   ins_cost(125);
2958   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2959   ins_encode %{
2960     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2961   %}
2962   ins_pipe( pipe_slow );
2963 %}
2964 
2965 // Move vectors (32 bytes long)
2966 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
2967   match(Set dst src);
2968   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
2969   ins_encode %{
2970     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2971       int vector_len = 2;
2972       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2973     } else {
2974       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2975     }
2976   %}
2977   ins_pipe( fpu_reg_reg );
2978 %}
2979 
2980 // Move vectors (32 bytes long)
2981 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
2982   match(Set dst src);
2983   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
2984   ins_encode %{
2985     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2986       int vector_len = 2;
2987       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2988     } else {
2989       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
2990     }
2991   %}
2992   ins_pipe( fpu_reg_reg );
2993 %}
2994 
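// 64-byte (512-bit) loads, and the matching stores further below, use the EVEX
// evmovdqul/evmovdquq forms with vector_len = 2 (the 512-bit encoding). The "k0"
// in the format strings denotes the implicit opmask register, i.e. no masking.
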
2995 // Load vectors (64 bytes long)
2996 instruct loadV64_dword(vecZ dst, memory mem) %{
2997   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2998   match(Set dst (LoadVector mem));
2999   ins_cost(125);
3000   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3001   ins_encode %{
3002     int vector_len = 2;
3003     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3004   %}
3005   ins_pipe( pipe_slow );
3006 %}
3007 
3008 // Load vectors (64 bytes long)
3009 instruct loadV64_qword(vecZ dst, memory mem) %{
3010   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3011   match(Set dst (LoadVector mem));
3012   ins_cost(125);
3013   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3014   ins_encode %{
3015     int vector_len = 2;
3016     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3017   %}
3018   ins_pipe( pipe_slow );
3019 %}
3020 
3021 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3022   match(Set dst src);
3023   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3024   ins_encode %{
3025     int vector_len = 2;
3026     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3027   %}
3028   ins_pipe( fpu_reg_reg );
3029 %}
3030 
3031 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3032   match(Set dst src);
3033   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3034   ins_encode %{
3035     int vector_len = 2;
3036     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3037   %}
3038   ins_pipe( fpu_reg_reg );
3039 %}
3040 
3041 // Store vectors
3042 instruct storeV4(memory mem, vecS src) %{
3043   predicate(n->as_StoreVector()->memory_size() == 4);
3044   match(Set mem (StoreVector mem src));
3045   ins_cost(145);
3046   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3047   ins_encode %{
3048     __ movdl($mem$$Address, $src$$XMMRegister);
3049   %}
3050   ins_pipe( pipe_slow );
3051 %}
3052 
3053 instruct storeV8(memory mem, vecD src) %{
3054   predicate(n->as_StoreVector()->memory_size() == 8);
3055   match(Set mem (StoreVector mem src));
3056   ins_cost(145);
3057   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3058   ins_encode %{
3059     __ movq($mem$$Address, $src$$XMMRegister);
3060   %}
3061   ins_pipe( pipe_slow );
3062 %}
3063 
3064 instruct storeV16(memory mem, vecX src) %{
3065   predicate(n->as_StoreVector()->memory_size() == 16);
3066   match(Set mem (StoreVector mem src));
3067   ins_cost(145);
3068   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3069   ins_encode %{
3070     __ movdqu($mem$$Address, $src$$XMMRegister);
3071   %}
3072   ins_pipe( pipe_slow );
3073 %}
3074 
3075 instruct storeV32(memory mem, vecY src) %{
3076   predicate(n->as_StoreVector()->memory_size() == 32);
3077   match(Set mem (StoreVector mem src));
3078   ins_cost(145);
3079   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3080   ins_encode %{
3081     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3082   %}
3083   ins_pipe( pipe_slow );
3084 %}
3085 
3086 instruct storeV64_dword(memory mem, vecZ src) %{
3087   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3088   match(Set mem (StoreVector mem src));
3089   ins_cost(145);
3090   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3091   ins_encode %{
3092     int vector_len = 2;
3093     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3094   %}
3095   ins_pipe( pipe_slow );
3096 %}
3097 
3098 instruct storeV64_qword(memory mem, vecZ src) %{
3099   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3100   match(Set mem (StoreVector mem src));
3101   ins_cost(145);
3102   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3103   ins_encode %{
3104     int vector_len = 2;
3105     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3106   %}
3107   ins_pipe( pipe_slow );
3108 %}
3109 
3110 // ====================LEGACY REPLICATE=======================================
3111 
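// These replicate (broadcast) rules are used when the AVX-512 VL/BW subsets are
// not available (note the !supports_avx512vl()/!supports_avx512vlbw() predicates):
// the scalar is moved into an XMM register and widened with shuffles and
// 128/256-bit lane inserts. With full AVX-512, EVEX broadcast forms are presumably
// matched by other rules instead.
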
3112 instruct Repl4B_mem(vecS dst, memory mem) %{
3113   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3114   match(Set dst (ReplicateB (LoadB mem)));
3115   format %{ "punpcklbw $dst,$mem\n\t"
3116             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3117   ins_encode %{
3118     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3119     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3120   %}
3121   ins_pipe( pipe_slow );
3122 %}
3123 
3124 instruct Repl8B_mem(vecD dst, memory mem) %{
3125   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3126   match(Set dst (ReplicateB (LoadB mem)));
3127   format %{ "punpcklbw $dst,$mem\n\t"
3128             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3129   ins_encode %{
3130     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3131     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3132   %}
3133   ins_pipe( pipe_slow );
3134 %}
3135 
3136 instruct Repl16B(vecX dst, rRegI src) %{
3137   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3138   match(Set dst (ReplicateB src));
3139   format %{ "movd    $dst,$src\n\t"
3140             "punpcklbw $dst,$dst\n\t"
3141             "pshuflw $dst,$dst,0x00\n\t"
3142             "punpcklqdq $dst,$dst\t! replicate16B" %}
3143   ins_encode %{
3144     __ movdl($dst$$XMMRegister, $src$$Register);
3145     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3146     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3147     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3148   %}
3149   ins_pipe( pipe_slow );
3150 %}
3151 
3152 instruct Repl16B_mem(vecX dst, memory mem) %{
3153   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3154   match(Set dst (ReplicateB (LoadB mem)));
3155   format %{ "punpcklbw $dst,$mem\n\t"
3156             "pshuflw $dst,$dst,0x00\n\t"
3157             "punpcklqdq $dst,$dst\t! replicate16B" %}
3158   ins_encode %{
3159     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3160     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3161     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3162   %}
3163   ins_pipe( pipe_slow );
3164 %}
3165 
3166 instruct Repl32B(vecY dst, rRegI src) %{
3167   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3168   match(Set dst (ReplicateB src));
3169   format %{ "movd    $dst,$src\n\t"
3170             "punpcklbw $dst,$dst\n\t"
3171             "pshuflw $dst,$dst,0x00\n\t"
3172             "punpcklqdq $dst,$dst\n\t"
3173             "vinserti128_high $dst,$dst\t! replicate32B" %}
3174   ins_encode %{
3175     __ movdl($dst$$XMMRegister, $src$$Register);
3176     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3177     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3178     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3179     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3180   %}
3181   ins_pipe( pipe_slow );
3182 %}
3183 
3184 instruct Repl32B_mem(vecY dst, memory mem) %{
3185   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3186   match(Set dst (ReplicateB (LoadB mem)));
3187   format %{ "punpcklbw $dst,$mem\n\t"
3188             "pshuflw $dst,$dst,0x00\n\t"
3189             "punpcklqdq $dst,$dst\n\t"
3190             "vinserti128_high $dst,$dst\t! replicate32B" %}
3191   ins_encode %{
3192     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3193     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3194     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3195     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3196   %}
3197   ins_pipe( pipe_slow );
3198 %}
3199 
3200 instruct Repl64B(legVecZ dst, rRegI src) %{
3201   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3202   match(Set dst (ReplicateB src));
3203   format %{ "movd    $dst,$src\n\t"
3204             "punpcklbw $dst,$dst\n\t"
3205             "pshuflw $dst,$dst,0x00\n\t"
3206             "punpcklqdq $dst,$dst\n\t"
3207             "vinserti128_high $dst,$dst\n\t"
3208             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3209   ins_encode %{
3210     __ movdl($dst$$XMMRegister, $src$$Register);
3211     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3212     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3213     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3214     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3215     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3216   %}
3217   ins_pipe( pipe_slow );
3218 %}
3219 
3220 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3221   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3222   match(Set dst (ReplicateB (LoadB mem)));
3223   format %{ "punpcklbw $dst,$mem\n\t"
3224             "pshuflw $dst,$dst,0x00\n\t"
3225             "punpcklqdq $dst,$dst\n\t"
3226             "vinserti128_high $dst,$dst\n\t"
3227             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3228   ins_encode %{
3229     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3230     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3231     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3232     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3233     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3234   %}
3235   ins_pipe( pipe_slow );
3236 %}
3237 
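// The *_imm replicate rules materialize the broadcast pattern as a 64-bit constant
// in the constant table (replicate8_imm is assumed to repeat $con at the given
// element width), load it with movq and then widen it to the full vector width.
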
3238 instruct Repl16B_imm(vecX dst, immI con) %{
3239   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3240   match(Set dst (ReplicateB con));
3241   format %{ "movq    $dst,[$constantaddress]\n\t"
3242             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3243   ins_encode %{
3244     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3245     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3246   %}
3247   ins_pipe( pipe_slow );
3248 %}
3249 
3250 instruct Repl32B_imm(vecY dst, immI con) %{
3251   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3252   match(Set dst (ReplicateB con));
3253   format %{ "movq    $dst,[$constantaddress]\n\t"
3254             "punpcklqdq $dst,$dst\n\t"
3255             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3256   ins_encode %{
3257     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3258     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3259     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3260   %}
3261   ins_pipe( pipe_slow );
3262 %}
3263 
3264 instruct Repl64B_imm(legVecZ dst, immI con) %{
3265   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3266   match(Set dst (ReplicateB con));
3267   format %{ "movq    $dst,[$constantaddress]\n\t"
3268             "punpcklqdq $dst,$dst\n\t"
3269             "vinserti128_high $dst,$dst\n\t"
3270             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3271   ins_encode %{
3272     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3273     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3274     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3275     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3276   %}
3277   ins_pipe( pipe_slow );
3278 %}
3279 
3280 instruct Repl4S(vecD dst, rRegI src) %{
3281   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3282   match(Set dst (ReplicateS src));
3283   format %{ "movd    $dst,$src\n\t"
3284             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3285   ins_encode %{
3286     __ movdl($dst$$XMMRegister, $src$$Register);
3287     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3288   %}
3289   ins_pipe( pipe_slow );
3290 %}
3291 
3292 instruct Repl4S_mem(vecD dst, memory mem) %{
3293   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3294   match(Set dst (ReplicateS (LoadS mem)));
3295   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3296   ins_encode %{
3297     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3298   %}
3299   ins_pipe( pipe_slow );
3300 %}
3301 
3302 instruct Repl8S(vecX dst, rRegI src) %{
3303   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3304   match(Set dst (ReplicateS src));
3305   format %{ "movd    $dst,$src\n\t"
3306             "pshuflw $dst,$dst,0x00\n\t"
3307             "punpcklqdq $dst,$dst\t! replicate8S" %}
3308   ins_encode %{
3309     __ movdl($dst$$XMMRegister, $src$$Register);
3310     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3311     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3312   %}
3313   ins_pipe( pipe_slow );
3314 %}
3315 
3316 instruct Repl8S_mem(vecX dst, memory mem) %{
3317   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3318   match(Set dst (ReplicateS (LoadS mem)));
3319   format %{ "pshuflw $dst,$mem,0x00\n\t"
3320             "punpcklqdq $dst,$dst\t! replicate8S" %}
3321   ins_encode %{
3322     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3323     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3324   %}
3325   ins_pipe( pipe_slow );
3326 %}
3327 
3328 instruct Repl8S_imm(vecX dst, immI con) %{
3329   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3330   match(Set dst (ReplicateS con));
3331   format %{ "movq    $dst,[$constantaddress]\n\t"
3332             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3333   ins_encode %{
3334     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3335     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3336   %}
3337   ins_pipe( pipe_slow );
3338 %}
3339 
3340 instruct Repl16S(vecY dst, rRegI src) %{
3341   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3342   match(Set dst (ReplicateS src));
3343   format %{ "movd    $dst,$src\n\t"
3344             "pshuflw $dst,$dst,0x00\n\t"
3345             "punpcklqdq $dst,$dst\n\t"
3346             "vinserti128_high $dst,$dst\t! replicate16S" %}
3347   ins_encode %{
3348     __ movdl($dst$$XMMRegister, $src$$Register);
3349     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3350     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3351     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct Repl16S_mem(vecY dst, memory mem) %{
3357   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3358   match(Set dst (ReplicateS (LoadS mem)));
3359   format %{ "pshuflw $dst,$mem,0x00\n\t"
3360             "punpcklqdq $dst,$dst\n\t"
3361             "vinserti128_high $dst,$dst\t! replicate16S" %}
3362   ins_encode %{
3363     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3364     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3365     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3366   %}
3367   ins_pipe( pipe_slow );
3368 %}
3369 
3370 instruct Repl16S_imm(vecY dst, immI con) %{
3371   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3372   match(Set dst (ReplicateS con));
3373   format %{ "movq    $dst,[$constantaddress]\n\t"
3374             "punpcklqdq $dst,$dst\n\t"
3375             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3376   ins_encode %{
3377     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3378     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3379     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3380   %}
3381   ins_pipe( pipe_slow );
3382 %}
3383 
3384 instruct Repl32S(legVecZ dst, rRegI src) %{
3385   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3386   match(Set dst (ReplicateS src));
3387   format %{ "movd    $dst,$src\n\t"
3388             "pshuflw $dst,$dst,0x00\n\t"
3389             "punpcklqdq $dst,$dst\n\t"
3390             "vinserti128_high $dst,$dst\n\t"
3391             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3392   ins_encode %{
3393     __ movdl($dst$$XMMRegister, $src$$Register);
3394     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3395     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3396     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3397     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3398   %}
3399   ins_pipe( pipe_slow );
3400 %}
3401 
3402 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3403   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3404   match(Set dst (ReplicateS (LoadS mem)));
3405   format %{ "pshuflw $dst,$mem,0x00\n\t"
3406             "punpcklqdq $dst,$dst\n\t"
3407             "vinserti128_high $dst,$dst\n\t"
3408             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3409   ins_encode %{
3410     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3411     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3412     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3413     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3414   %}
3415   ins_pipe( pipe_slow );
3416 %}
3417 
3418 instruct Repl32S_imm(legVecZ dst, immI con) %{
3419   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3420   match(Set dst (ReplicateS con));
3421   format %{ "movq    $dst,[$constantaddress]\n\t"
3422             "punpcklqdq $dst,$dst\n\t"
3423             "vinserti128_high $dst,$dst\n\t"
3424             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3425   ins_encode %{
3426     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3427     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3428     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3429     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3430   %}
3431   ins_pipe( pipe_slow );
3432 %}
3433 
3434 instruct Repl4I(vecX dst, rRegI src) %{
3435   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3436   match(Set dst (ReplicateI src));
3437   format %{ "movd    $dst,$src\n\t"
3438             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3439   ins_encode %{
3440     __ movdl($dst$$XMMRegister, $src$$Register);
3441     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3442   %}
3443   ins_pipe( pipe_slow );
3444 %}
3445 
3446 instruct Repl4I_mem(vecX dst, memory mem) %{
3447   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3448   match(Set dst (ReplicateI (LoadI mem)));
3449   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3450   ins_encode %{
3451     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3452   %}
3453   ins_pipe( pipe_slow );
3454 %}
3455 
3456 instruct Repl8I(vecY dst, rRegI src) %{
3457   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3458   match(Set dst (ReplicateI src));
3459   format %{ "movd    $dst,$src\n\t"
3460             "pshufd  $dst,$dst,0x00\n\t"
3461             "vinserti128_high $dst,$dst\t! replicate8I" %}
3462   ins_encode %{
3463     __ movdl($dst$$XMMRegister, $src$$Register);
3464     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3465     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3466   %}
3467   ins_pipe( pipe_slow );
3468 %}
3469 
3470 instruct Repl8I_mem(vecY dst, memory mem) %{
3471   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3472   match(Set dst (ReplicateI (LoadI mem)));
3473   format %{ "pshufd  $dst,$mem,0x00\n\t"
3474             "vinserti128_high $dst,$dst\t! replicate8I" %}
3475   ins_encode %{
3476     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3477     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3478   %}
3479   ins_pipe( pipe_slow );
3480 %}
3481 
3482 instruct Repl16I(legVecZ dst, rRegI src) %{
3483   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3484   match(Set dst (ReplicateI src));
3485   format %{ "movd    $dst,$src\n\t"
3486             "pshufd  $dst,$dst,0x00\n\t"
3487             "vinserti128_high $dst,$dst\n\t"
3488             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3489   ins_encode %{
3490     __ movdl($dst$$XMMRegister, $src$$Register);
3491     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3492     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3493     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3494   %}
3495   ins_pipe( pipe_slow );
3496 %}
3497 
3498 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3499   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3500   match(Set dst (ReplicateI (LoadI mem)));
3501   format %{ "pshufd  $dst,$mem,0x00\n\t"
3502             "vinserti128_high $dst,$dst\n\t"
3503             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3504   ins_encode %{
3505     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3506     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3507     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3508   %}
3509   ins_pipe( pipe_slow );
3510 %}
3511 
3512 instruct Repl4I_imm(vecX dst, immI con) %{
3513   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3514   match(Set dst (ReplicateI con));
3515   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3516             "punpcklqdq $dst,$dst" %}
3517   ins_encode %{
3518     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3519     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3520   %}
3521   ins_pipe( pipe_slow );
3522 %}
3523 
3524 instruct Repl8I_imm(vecY dst, immI con) %{
3525   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3526   match(Set dst (ReplicateI con));
3527   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3528             "punpcklqdq $dst,$dst\n\t"
3529             "vinserti128_high $dst,$dst" %}
3530   ins_encode %{
3531     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3532     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3533     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3534   %}
3535   ins_pipe( pipe_slow );
3536 %}
3537 
3538 instruct Repl16I_imm(legVecZ dst, immI con) %{
3539   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3540   match(Set dst (ReplicateI con));
3541   format %{ "movq    $dst,[$constantaddress]\n\t"
3542             "punpcklqdq $dst,$dst\n\t"
3543             "vinserti128_high $dst,$dst\n\t"
3544             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3545   ins_encode %{
3546     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3547     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3548     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3549     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3550   %}
3551   ins_pipe( pipe_slow );
3552 %}
3553 
3554 // A long can be loaded into an XMM register directly from memory.
3555 instruct Repl2L_mem(vecX dst, memory mem) %{
3556   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3557   match(Set dst (ReplicateL (LoadL mem)));
3558   format %{ "movq    $dst,$mem\n\t"
3559             "punpcklqdq $dst,$dst\t! replicate2L" %}
3560   ins_encode %{
3561     __ movq($dst$$XMMRegister, $mem$$Address);
3562     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3563   %}
3564   ins_pipe( pipe_slow );
3565 %}
3566 
3567 // Replicate a long (8-byte) scalar into a vector
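// On 64-bit (_LP64) the scalar arrives in a single rRegL; on 32-bit it arrives as an
// eRegL register pair, so the low and high halves are combined with movdl/punpckldq
// (HIGH_FROM_LOW yielding the high half of the pair) before broadcasting.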
3568 #ifdef _LP64
3569 instruct Repl4L(vecY dst, rRegL src) %{
3570   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3571   match(Set dst (ReplicateL src));
3572   format %{ "movdq   $dst,$src\n\t"
3573             "punpcklqdq $dst,$dst\n\t"
3574             "vinserti128_high $dst,$dst\t! replicate4L" %}
3575   ins_encode %{
3576     __ movdq($dst$$XMMRegister, $src$$Register);
3577     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3578     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3579   %}
3580   ins_pipe( pipe_slow );
3581 %}
3582 
3583 instruct Repl8L(legVecZ dst, rRegL src) %{
3584   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3585   match(Set dst (ReplicateL src));
3586   format %{ "movdq   $dst,$src\n\t"
3587             "punpcklqdq $dst,$dst\n\t"
3588             "vinserti128_high $dst,$dst\n\t"
3589             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3590   ins_encode %{
3591     __ movdq($dst$$XMMRegister, $src$$Register);
3592     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3593     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3594     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3595   %}
3596   ins_pipe( pipe_slow );
3597 %}
3598 #else // _LP64
3599 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3600   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3601   match(Set dst (ReplicateL src));
3602   effect(TEMP dst, USE src, TEMP tmp);
3603   format %{ "movdl   $dst,$src.lo\n\t"
3604             "movdl   $tmp,$src.hi\n\t"
3605             "punpckldq $dst,$tmp\n\t"
3606             "punpcklqdq $dst,$dst\n\t"
3607             "vinserti128_high $dst,$dst\t! replicate4L" %}
3608   ins_encode %{
3609     __ movdl($dst$$XMMRegister, $src$$Register);
3610     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3611     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3612     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3613     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3614   %}
3615   ins_pipe( pipe_slow );
3616 %}
3617 
3618 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3619   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3620   match(Set dst (ReplicateL src));
3621   effect(TEMP dst, USE src, TEMP tmp);
3622   format %{ "movdl   $dst,$src.lo\n\t"
3623             "movdl   $tmp,$src.hi\n\t"
3624             "punpckldq $dst,$tmp\n\t"
3625             "punpcklqdq $dst,$dst\n\t"
3626             "vinserti128_high $dst,$dst\n\t"
3627             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3628   ins_encode %{
3629     __ movdl($dst$$XMMRegister, $src$$Register);
3630     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3631     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3632     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3633     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3634     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3635   %}
3636   ins_pipe( pipe_slow );
3637 %}
3638 #endif // _LP64
3639 
3640 instruct Repl4L_imm(vecY dst, immL con) %{
3641   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3642   match(Set dst (ReplicateL con));
3643   format %{ "movq    $dst,[$constantaddress]\n\t"
3644             "punpcklqdq $dst,$dst\n\t"
3645             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3646   ins_encode %{
3647     __ movq($dst$$XMMRegister, $constantaddress($con));
3648     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3649     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3650   %}
3651   ins_pipe( pipe_slow );
3652 %}
3653 
3654 instruct Repl8L_imm(legVecZ dst, immL con) %{
3655   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3656   match(Set dst (ReplicateL con));
3657   format %{ "movq    $dst,[$constantaddress]\n\t"
3658             "punpcklqdq $dst,$dst\n\t"
3659             "vinserti128_high $dst,$dst\n\t"
3660             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3661   ins_encode %{
3662     __ movq($dst$$XMMRegister, $constantaddress($con));
3663     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3664     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3665     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3666   %}
3667   ins_pipe( pipe_slow );
3668 %}
3669 
3670 instruct Repl4L_mem(vecY dst, memory mem) %{
3671   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3672   match(Set dst (ReplicateL (LoadL mem)));
3673   format %{ "movq    $dst,$mem\n\t"
3674             "punpcklqdq $dst,$dst\n\t"
3675             "vinserti128_high $dst,$dst\t! replicate4L" %}
3676   ins_encode %{
3677     __ movq($dst$$XMMRegister, $mem$$Address);
3678     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3679     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3680   %}
3681   ins_pipe( pipe_slow );
3682 %}
3683 
3684 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3685   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3686   match(Set dst (ReplicateL (LoadL mem)));
3687   format %{ "movq    $dst,$mem\n\t"
3688             "punpcklqdq $dst,$dst\n\t"
3689             "vinserti128_high $dst,$dst\t"
3690             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3691   ins_encode %{
3692     __ movq($dst$$XMMRegister, $mem$$Address);
3693     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3694     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3695     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3696   %}
3697   ins_pipe( pipe_slow );
3698 %}
3699 
3700 instruct Repl2F_mem(vecD dst, memory mem) %{
3701   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3702   match(Set dst (ReplicateF (LoadF mem)));
3703   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3704   ins_encode %{
3705     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3706   %}
3707   ins_pipe( pipe_slow );
3708 %}
3709 
3710 instruct Repl4F_mem(vecX dst, memory mem) %{
3711   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3712   match(Set dst (ReplicateF (LoadF mem)));
3713   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3714   ins_encode %{
3715     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3716   %}
3717   ins_pipe( pipe_slow );
3718 %}
3719 
3720 instruct Repl8F(vecY dst, vlRegF src) %{
3721   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3722   match(Set dst (ReplicateF src));
3723   format %{ "pshufd  $dst,$src,0x00\n\t"
3724             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3725   ins_encode %{
3726     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3727     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3728   %}
3729   ins_pipe( pipe_slow );
3730 %}
3731 
3732 instruct Repl8F_mem(vecY dst, memory mem) %{
3733   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3734   match(Set dst (ReplicateF (LoadF mem)));
3735   format %{ "pshufd  $dst,$mem,0x00\n\t"
3736             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3737   ins_encode %{
3738     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3739     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3740   %}
3741   ins_pipe( pipe_slow );
3742 %}
3743 
3744 instruct Repl16F(legVecZ dst, vlRegF src) %{
3745   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3746   match(Set dst (ReplicateF src));
3747   format %{ "pshufd  $dst,$src,0x00\n\t"
3748             "vinsertf128_high $dst,$dst\t"
3749             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3750   ins_encode %{
3751     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3752     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3753     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3754   %}
3755   ins_pipe( pipe_slow );
3756 %}
3757 
3758 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3759   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3760   match(Set dst (ReplicateF (LoadF mem)));
3761   format %{ "pshufd  $dst,$mem,0x00\n\t"
3762             "vinsertf128_high $dst,$dst\t"
3763             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3764   ins_encode %{
3765     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3766     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3767     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3768   %}
3769   ins_pipe( pipe_slow );
3770 %}
3771 
3772 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3773   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3774   match(Set dst (ReplicateF zero));
3775   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3776   ins_encode %{
3777     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3778   %}
3779   ins_pipe( fpu_reg_reg );
3780 %}
3781 
3782 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3783   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3784   match(Set dst (ReplicateF zero));
3785   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3786   ins_encode %{
3787     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3788   %}
3789   ins_pipe( fpu_reg_reg );
3790 %}
3791 
3792 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3793   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3794   match(Set dst (ReplicateF zero));
3795   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3796   ins_encode %{
3797     int vector_len = 1;
3798     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3799   %}
3800   ins_pipe( fpu_reg_reg );
3801 %}
3802 
3803 instruct Repl2D_mem(vecX dst, memory mem) %{
3804   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3805   match(Set dst (ReplicateD (LoadD mem)));
3806   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3807   ins_encode %{
3808     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3809   %}
3810   ins_pipe( pipe_slow );
3811 %}
3812 
3813 instruct Repl4D(vecY dst, vlRegD src) %{
3814   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3815   match(Set dst (ReplicateD src));
3816   format %{ "pshufd  $dst,$src,0x44\n\t"
3817             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3818   ins_encode %{
3819     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3820     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3821   %}
3822   ins_pipe( pipe_slow );
3823 %}
3824 
3825 instruct Repl4D_mem(vecY dst, memory mem) %{
3826   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3827   match(Set dst (ReplicateD (LoadD mem)));
3828   format %{ "pshufd  $dst,$mem,0x44\n\t"
3829             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3830   ins_encode %{
3831     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3832     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3833   %}
3834   ins_pipe( pipe_slow );
3835 %}
3836 
3837 instruct Repl8D(legVecZ dst, vlRegD src) %{
3838   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3839   match(Set dst (ReplicateD src));
3840   format %{ "pshufd  $dst,$src,0x44\n\t"
3841             "vinsertf128_high $dst,$dst\t"
3842             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3843   ins_encode %{
3844     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3845     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3846     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3847   %}
3848   ins_pipe( pipe_slow );
3849 %}
3850 
3851 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3852   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3853   match(Set dst (ReplicateD (LoadD mem)));
3854   format %{ "pshufd  $dst,$mem,0x44\n\t"
3855             "vinsertf128_high $dst,$dst\t"
3856             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3857   ins_encode %{
3858     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3859     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3860     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3861   %}
3862   ins_pipe( pipe_slow );
3863 %}
3864 
3865 // Replicate double (8 byte) scalar zero to be vector
3866 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3867   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3868   match(Set dst (ReplicateD zero));
3869   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3870   ins_encode %{
3871     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3872   %}
3873   ins_pipe( fpu_reg_reg );
3874 %}
3875 
3876 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3877   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3878   match(Set dst (ReplicateD zero));
3879   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3880   ins_encode %{
3881     int vector_len = 1;
3882     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3883   %}
3884   ins_pipe( fpu_reg_reg );
3885 %}
3886 
3887 // ====================GENERIC REPLICATE==========================================
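// These rules fill a vector from a scalar without the AVX2/EVEX broadcast forms: the
// scalar is moved into the low lanes of an XMM register and then spread across the rest
// of the register with unpack/shuffle instructions (punpcklbw, pshuflw, pshufd, punpcklqdq).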
3888 
3889 // Replicate byte scalar to be vector
3890 instruct Repl4B(vecS dst, rRegI src) %{
3891   predicate(n->as_Vector()->length() == 4);
3892   match(Set dst (ReplicateB src));
3893   format %{ "movd    $dst,$src\n\t"
3894             "punpcklbw $dst,$dst\n\t"
3895             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3896   ins_encode %{
3897     __ movdl($dst$$XMMRegister, $src$$Register);
3898     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3899     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3900   %}
3901   ins_pipe( pipe_slow );
3902 %}
3903 
3904 instruct Repl8B(vecD dst, rRegI src) %{
3905   predicate(n->as_Vector()->length() == 8);
3906   match(Set dst (ReplicateB src));
3907   format %{ "movd    $dst,$src\n\t"
3908             "punpcklbw $dst,$dst\n\t"
3909             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3910   ins_encode %{
3911     __ movdl($dst$$XMMRegister, $src$$Register);
3912     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3913     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3914   %}
3915   ins_pipe( pipe_slow );
3916 %}
3917 
3918 // Replicate byte scalar immediate to be vector by loading from const table.
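// replicate4_imm/replicate8_imm build the constant-table entry by repeating the low
// 'width' bytes of the immediate until 4 or 8 bytes are filled, so a single movdl/movq
// loads an already-replicated pattern.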
3919 instruct Repl4B_imm(vecS dst, immI con) %{
3920   predicate(n->as_Vector()->length() == 4);
3921   match(Set dst (ReplicateB con));
3922   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3923   ins_encode %{
3924     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3925   %}
3926   ins_pipe( pipe_slow );
3927 %}
3928 
3929 instruct Repl8B_imm(vecD dst, immI con) %{
3930   predicate(n->as_Vector()->length() == 8);
3931   match(Set dst (ReplicateB con));
3932   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3933   ins_encode %{
3934     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3935   %}
3936   ins_pipe( pipe_slow );
3937 %}
3938 
3939 // Replicate byte scalar zero to be vector
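// pxor of a register with itself is the canonical SSE zeroing idiom.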
3940 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3941   predicate(n->as_Vector()->length() == 4);
3942   match(Set dst (ReplicateB zero));
3943   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3944   ins_encode %{
3945     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3946   %}
3947   ins_pipe( fpu_reg_reg );
3948 %}
3949 
3950 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3951   predicate(n->as_Vector()->length() == 8);
3952   match(Set dst (ReplicateB zero));
3953   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3954   ins_encode %{
3955     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3956   %}
3957   ins_pipe( fpu_reg_reg );
3958 %}
3959 
3960 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3961   predicate(n->as_Vector()->length() == 16);
3962   match(Set dst (ReplicateB zero));
3963   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3964   ins_encode %{
3965     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3966   %}
3967   ins_pipe( fpu_reg_reg );
3968 %}
3969 
3970 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3971   predicate(n->as_Vector()->length() == 32);
3972   match(Set dst (ReplicateB zero));
3973   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3974   ins_encode %{
3975     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3976     int vector_len = 1;
3977     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3978   %}
3979   ins_pipe( fpu_reg_reg );
3980 %}
3981 
3982 // Replicate char/short (2 byte) scalar to be vector
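// pshuflw with immediate 0x00 broadcasts word 0 across the low four 16-bit lanes; the
// upper half of the register is untouched, which covers vectors of up to 64 bits.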
3983 instruct Repl2S(vecS dst, rRegI src) %{
3984   predicate(n->as_Vector()->length() == 2);
3985   match(Set dst (ReplicateS src));
3986   format %{ "movd    $dst,$src\n\t"
3987             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3988   ins_encode %{
3989     __ movdl($dst$$XMMRegister, $src$$Register);
3990     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3991   %}
3992   ins_pipe( fpu_reg_reg );
3993 %}
3994 
3995 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3996 instruct Repl2S_imm(vecS dst, immI con) %{
3997   predicate(n->as_Vector()->length() == 2);
3998   match(Set dst (ReplicateS con));
3999   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
4000   ins_encode %{
4001     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4002   %}
4003   ins_pipe( fpu_reg_reg );
4004 %}
4005 
4006 instruct Repl4S_imm(vecD dst, immI con) %{
4007   predicate(n->as_Vector()->length() == 4);
4008   match(Set dst (ReplicateS con));
4009   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4010   ins_encode %{
4011     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4012   %}
4013   ins_pipe( fpu_reg_reg );
4014 %}
4015 
4016 // Replicate char/short (2 byte) scalar zero to be vector
4017 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4018   predicate(n->as_Vector()->length() == 2);
4019   match(Set dst (ReplicateS zero));
4020   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4021   ins_encode %{
4022     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4023   %}
4024   ins_pipe( fpu_reg_reg );
4025 %}
4026 
4027 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4028   predicate(n->as_Vector()->length() == 4);
4029   match(Set dst (ReplicateS zero));
4030   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4031   ins_encode %{
4032     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4033   %}
4034   ins_pipe( fpu_reg_reg );
4035 %}
4036 
4037 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4038   predicate(n->as_Vector()->length() == 8);
4039   match(Set dst (ReplicateS zero));
4040   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4041   ins_encode %{
4042     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4043   %}
4044   ins_pipe( fpu_reg_reg );
4045 %}
4046 
4047 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4048   predicate(n->as_Vector()->length() == 16);
4049   match(Set dst (ReplicateS zero));
4050   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4051   ins_encode %{
4052     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4053     int vector_len = 1;
4054     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4055   %}
4056   ins_pipe( fpu_reg_reg );
4057 %}
4058 
4059 // Replicate integer (4 byte) scalar to be vector
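// pshufd with immediate 0x00 broadcasts dword 0 of the source to all four 32-bit lanes.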
4060 instruct Repl2I(vecD dst, rRegI src) %{
4061   predicate(n->as_Vector()->length() == 2);
4062   match(Set dst (ReplicateI src));
4063   format %{ "movd    $dst,$src\n\t"
4064             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4065   ins_encode %{
4066     __ movdl($dst$$XMMRegister, $src$$Register);
4067     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4068   %}
4069   ins_pipe( fpu_reg_reg );
4070 %}
4071 
// An integer can be loaded into an XMM register directly from memory.
4073 instruct Repl2I_mem(vecD dst, memory mem) %{
4074   predicate(n->as_Vector()->length() == 2);
4075   match(Set dst (ReplicateI (LoadI mem)));
4076   format %{ "movd    $dst,$mem\n\t"
4077             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4078   ins_encode %{
4079     __ movdl($dst$$XMMRegister, $mem$$Address);
4080     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4081   %}
4082   ins_pipe( fpu_reg_reg );
4083 %}
4084 
4085 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4086 instruct Repl2I_imm(vecD dst, immI con) %{
4087   predicate(n->as_Vector()->length() == 2);
4088   match(Set dst (ReplicateI con));
4089   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4090   ins_encode %{
4091     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4092   %}
4093   ins_pipe( fpu_reg_reg );
4094 %}
4095 
4096 // Replicate integer (4 byte) scalar zero to be vector
4097 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4098   predicate(n->as_Vector()->length() == 2);
4099   match(Set dst (ReplicateI zero));
4100   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4101   ins_encode %{
4102     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4103   %}
4104   ins_pipe( fpu_reg_reg );
4105 %}
4106 
4107 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4108   predicate(n->as_Vector()->length() == 4);
4109   match(Set dst (ReplicateI zero));
4110   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4111   ins_encode %{
4112     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4113   %}
4114   ins_pipe( fpu_reg_reg );
4115 %}
4116 
4117 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4118   predicate(n->as_Vector()->length() == 8);
4119   match(Set dst (ReplicateI zero));
4120   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4121   ins_encode %{
4122     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4123     int vector_len = 1;
4124     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4125   %}
4126   ins_pipe( fpu_reg_reg );
4127 %}
4128 
4129 // Replicate long (8 byte) scalar to be vector
4130 #ifdef _LP64
4131 instruct Repl2L(vecX dst, rRegL src) %{
4132   predicate(n->as_Vector()->length() == 2);
4133   match(Set dst (ReplicateL src));
4134   format %{ "movdq   $dst,$src\n\t"
4135             "punpcklqdq $dst,$dst\t! replicate2L" %}
4136   ins_encode %{
4137     __ movdq($dst$$XMMRegister, $src$$Register);
4138     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4139   %}
4140   ins_pipe( pipe_slow );
4141 %}
4142 #else // _LP64
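// Without 64-bit general registers the long lives in a register pair: the low and high
// halves are moved into XMM registers separately (HIGH_FROM_LOW names the pair's high
// register), merged with punpckldq, and the 64-bit value is then duplicated with punpcklqdq.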
4143 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4144   predicate(n->as_Vector()->length() == 2);
4145   match(Set dst (ReplicateL src));
4146   effect(TEMP dst, USE src, TEMP tmp);
4147   format %{ "movdl   $dst,$src.lo\n\t"
4148             "movdl   $tmp,$src.hi\n\t"
4149             "punpckldq $dst,$tmp\n\t"
4150             "punpcklqdq $dst,$dst\t! replicate2L"%}
4151   ins_encode %{
4152     __ movdl($dst$$XMMRegister, $src$$Register);
4153     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4154     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4155     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4156   %}
4157   ins_pipe( pipe_slow );
4158 %}
4159 #endif // _LP64
4160 
4161 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4162 instruct Repl2L_imm(vecX dst, immL con) %{
4163   predicate(n->as_Vector()->length() == 2);
4164   match(Set dst (ReplicateL con));
4165   format %{ "movq    $dst,[$constantaddress]\n\t"
4166             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4167   ins_encode %{
4168     __ movq($dst$$XMMRegister, $constantaddress($con));
4169     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4170   %}
4171   ins_pipe( pipe_slow );
4172 %}
4173 
4174 // Replicate long (8 byte) scalar zero to be vector
4175 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4176   predicate(n->as_Vector()->length() == 2);
4177   match(Set dst (ReplicateL zero));
4178   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4179   ins_encode %{
4180     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4181   %}
4182   ins_pipe( fpu_reg_reg );
4183 %}
4184 
4185 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4186   predicate(n->as_Vector()->length() == 4);
4187   match(Set dst (ReplicateL zero));
4188   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4189   ins_encode %{
4190     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4191     int vector_len = 1;
4192     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4193   %}
4194   ins_pipe( fpu_reg_reg );
4195 %}
4196 
4197 // Replicate float (4 byte) scalar to be vector
4198 instruct Repl2F(vecD dst, vlRegF src) %{
4199   predicate(n->as_Vector()->length() == 2);
4200   match(Set dst (ReplicateF src));
4201   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4202   ins_encode %{
4203     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4204   %}
4205   ins_pipe( fpu_reg_reg );
4206 %}
4207 
4208 instruct Repl4F(vecX dst, vlRegF src) %{
4209   predicate(n->as_Vector()->length() == 4);
4210   match(Set dst (ReplicateF src));
4211   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4212   ins_encode %{
4213     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4214   %}
4215   ins_pipe( pipe_slow );
4216 %}
4217 
4218 // Replicate double (8 bytes) scalar to be vector
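// pshufd with immediate 0x44 selects dwords {0,1,0,1}, duplicating the low 64-bit element
// into both halves of the 128-bit register.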
4219 instruct Repl2D(vecX dst, vlRegD src) %{
4220   predicate(n->as_Vector()->length() == 2);
4221   match(Set dst (ReplicateD src));
4222   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4223   ins_encode %{
4224     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4225   %}
4226   ins_pipe( pipe_slow );
4227 %}
4228 
4229 // ====================EVEX REPLICATE=============================================
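// These rules use the AVX2/EVEX broadcast forms (vpbroadcastb/w/d/q, evpbroadcast*,
// vbroadcastss/sd), which replicate the scalar across the whole destination register in a
// single instruction.  vector_len selects the operation width: 0 = 128-bit, 1 = 256-bit,
// 2 = 512-bit.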
4230 
4231 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4232   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4233   match(Set dst (ReplicateB (LoadB mem)));
4234   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4235   ins_encode %{
4236     int vector_len = 0;
4237     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4238   %}
4239   ins_pipe( pipe_slow );
4240 %}
4241 
4242 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4243   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4244   match(Set dst (ReplicateB (LoadB mem)));
4245   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4246   ins_encode %{
4247     int vector_len = 0;
4248     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4249   %}
4250   ins_pipe( pipe_slow );
4251 %}
4252 
4253 instruct Repl16B_evex(vecX dst, rRegI src) %{
4254   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4255   match(Set dst (ReplicateB src));
4256   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4257   ins_encode %{
    int vector_len = 0;
4259     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4260   %}
4261   ins_pipe( pipe_slow );
4262 %}
4263 
4264 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4265   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4266   match(Set dst (ReplicateB (LoadB mem)));
4267   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4268   ins_encode %{
4269     int vector_len = 0;
4270     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4271   %}
4272   ins_pipe( pipe_slow );
4273 %}
4274 
4275 instruct Repl32B_evex(vecY dst, rRegI src) %{
4276   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4277   match(Set dst (ReplicateB src));
4278   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4279   ins_encode %{
    int vector_len = 1;
4281     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4282   %}
4283   ins_pipe( pipe_slow );
4284 %}
4285 
4286 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4287   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4288   match(Set dst (ReplicateB (LoadB mem)));
4289   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4290   ins_encode %{
4291     int vector_len = 1;
4292     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4293   %}
4294   ins_pipe( pipe_slow );
4295 %}
4296 
4297 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4298   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4299   match(Set dst (ReplicateB src));
4300   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4301   ins_encode %{
    int vector_len = 2;
4303     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4304   %}
4305   ins_pipe( pipe_slow );
4306 %}
4307 
4308 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4309   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4310   match(Set dst (ReplicateB (LoadB mem)));
4311   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4312   ins_encode %{
4313     int vector_len = 2;
4314     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4315   %}
4316   ins_pipe( pipe_slow );
4317 %}
4318 
4319 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4320   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4321   match(Set dst (ReplicateB con));
4322   format %{ "movq    $dst,[$constantaddress]\n\t"
4323             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4324   ins_encode %{
    int vector_len = 0;
4326     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4327     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4328   %}
4329   ins_pipe( pipe_slow );
4330 %}
4331 
4332 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4333   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4334   match(Set dst (ReplicateB con));
4335   format %{ "movq    $dst,[$constantaddress]\n\t"
4336             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4337   ins_encode %{
    int vector_len = 1;
4339     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4340     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4341   %}
4342   ins_pipe( pipe_slow );
4343 %}
4344 
4345 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4346   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4347   match(Set dst (ReplicateB con));
4348   format %{ "movq    $dst,[$constantaddress]\n\t"
4349             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4350   ins_encode %{
    int vector_len = 2;
4352     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4353     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4354   %}
4355   ins_pipe( pipe_slow );
4356 %}
4357 
4358 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4359   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4360   match(Set dst (ReplicateB zero));
4361   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4362   ins_encode %{
4363     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4364     int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4366   %}
4367   ins_pipe( fpu_reg_reg );
4368 %}
4369 
4370 instruct Repl4S_evex(vecD dst, rRegI src) %{
4371   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4372   match(Set dst (ReplicateS src));
4373   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4374   ins_encode %{
    int vector_len = 0;
4376     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4377   %}
4378   ins_pipe( pipe_slow );
4379 %}
4380 
4381 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4382   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4383   match(Set dst (ReplicateS (LoadS mem)));
4384   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4385   ins_encode %{
4386     int vector_len = 0;
4387     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4388   %}
4389   ins_pipe( pipe_slow );
4390 %}
4391 
4392 instruct Repl8S_evex(vecX dst, rRegI src) %{
4393   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4394   match(Set dst (ReplicateS src));
4395   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4396   ins_encode %{
    int vector_len = 0;
4398     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4399   %}
4400   ins_pipe( pipe_slow );
4401 %}
4402 
4403 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4404   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4405   match(Set dst (ReplicateS (LoadS mem)));
4406   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4407   ins_encode %{
4408     int vector_len = 0;
4409     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4410   %}
4411   ins_pipe( pipe_slow );
4412 %}
4413 
4414 instruct Repl16S_evex(vecY dst, rRegI src) %{
4415   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4416   match(Set dst (ReplicateS src));
4417   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4418   ins_encode %{
    int vector_len = 1;
4420     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4421   %}
4422   ins_pipe( pipe_slow );
4423 %}
4424 
4425 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4426   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4427   match(Set dst (ReplicateS (LoadS mem)));
4428   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4429   ins_encode %{
4430     int vector_len = 1;
4431     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4432   %}
4433   ins_pipe( pipe_slow );
4434 %}
4435 
4436 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4437   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4438   match(Set dst (ReplicateS src));
4439   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4440   ins_encode %{
    int vector_len = 2;
4442     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4443   %}
4444   ins_pipe( pipe_slow );
4445 %}
4446 
4447 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4448   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4449   match(Set dst (ReplicateS (LoadS mem)));
4450   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4451   ins_encode %{
4452     int vector_len = 2;
4453     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4454   %}
4455   ins_pipe( pipe_slow );
4456 %}
4457 
4458 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4459   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4460   match(Set dst (ReplicateS con));
4461   format %{ "movq    $dst,[$constantaddress]\n\t"
4462             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4463   ins_encode %{
    int vector_len = 0;
4465     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4466     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4467   %}
4468   ins_pipe( pipe_slow );
4469 %}
4470 
4471 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4472   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4473   match(Set dst (ReplicateS con));
4474   format %{ "movq    $dst,[$constantaddress]\n\t"
4475             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4476   ins_encode %{
    int vector_len = 1;
4478     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4479     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4480   %}
4481   ins_pipe( pipe_slow );
4482 %}
4483 
4484 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4485   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4486   match(Set dst (ReplicateS con));
4487   format %{ "movq    $dst,[$constantaddress]\n\t"
4488             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4489   ins_encode %{
    int vector_len = 2;
4491     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4492     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4493   %}
4494   ins_pipe( pipe_slow );
4495 %}
4496 
4497 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4498   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4499   match(Set dst (ReplicateS zero));
4500   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4501   ins_encode %{
4502     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4503     int vector_len = 2;
4504     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4505   %}
4506   ins_pipe( fpu_reg_reg );
4507 %}
4508 
4509 instruct Repl4I_evex(vecX dst, rRegI src) %{
4510   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4511   match(Set dst (ReplicateI src));
4512   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4513   ins_encode %{
4514     int vector_len = 0;
4515     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4516   %}
4517   ins_pipe( pipe_slow );
4518 %}
4519 
4520 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4521   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4522   match(Set dst (ReplicateI (LoadI mem)));
4523   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4524   ins_encode %{
4525     int vector_len = 0;
4526     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4527   %}
4528   ins_pipe( pipe_slow );
4529 %}
4530 
4531 instruct Repl8I_evex(vecY dst, rRegI src) %{
4532   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4533   match(Set dst (ReplicateI src));
4534   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4535   ins_encode %{
4536     int vector_len = 1;
4537     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4538   %}
4539   ins_pipe( pipe_slow );
4540 %}
4541 
4542 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4543   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4544   match(Set dst (ReplicateI (LoadI mem)));
4545   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4546   ins_encode %{
4547     int vector_len = 1;
4548     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4549   %}
4550   ins_pipe( pipe_slow );
4551 %}
4552 
4553 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4554   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4555   match(Set dst (ReplicateI src));
4556   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4557   ins_encode %{
4558     int vector_len = 2;
4559     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4560   %}
4561   ins_pipe( pipe_slow );
4562 %}
4563 
4564 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4565   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4566   match(Set dst (ReplicateI (LoadI mem)));
4567   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4568   ins_encode %{
4569     int vector_len = 2;
4570     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4571   %}
4572   ins_pipe( pipe_slow );
4573 %}
4574 
4575 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4576   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4577   match(Set dst (ReplicateI con));
4578   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4579             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4580   ins_encode %{
4581     int vector_len = 0;
4582     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4583     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4584   %}
4585   ins_pipe( pipe_slow );
4586 %}
4587 
4588 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4589   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4590   match(Set dst (ReplicateI con));
4591   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4592             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4593   ins_encode %{
4594     int vector_len = 1;
4595     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4596     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4597   %}
4598   ins_pipe( pipe_slow );
4599 %}
4600 
4601 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4602   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4603   match(Set dst (ReplicateI con));
4604   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4605             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4606   ins_encode %{
4607     int vector_len = 2;
4608     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4609     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4610   %}
4611   ins_pipe( pipe_slow );
4612 %}
4613 
4614 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4615   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4616   match(Set dst (ReplicateI zero));
4617   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4618   ins_encode %{
    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4620     int vector_len = 2;
4621     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4622   %}
4623   ins_pipe( fpu_reg_reg );
4624 %}
4625 
4626 // Replicate long (8 byte) scalar to be vector
4627 #ifdef _LP64
4628 instruct Repl4L_evex(vecY dst, rRegL src) %{
4629   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4630   match(Set dst (ReplicateL src));
4631   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4632   ins_encode %{
4633     int vector_len = 1;
4634     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4635   %}
4636   ins_pipe( pipe_slow );
4637 %}
4638 
4639 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4640   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4641   match(Set dst (ReplicateL src));
4642   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4643   ins_encode %{
4644     int vector_len = 2;
4645     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4646   %}
4647   ins_pipe( pipe_slow );
4648 %}
4649 #else // _LP64
4650 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4651   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4652   match(Set dst (ReplicateL src));
4653   effect(TEMP dst, USE src, TEMP tmp);
4654   format %{ "movdl   $dst,$src.lo\n\t"
4655             "movdl   $tmp,$src.hi\n\t"
4656             "punpckldq $dst,$tmp\n\t"
4657             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4658   ins_encode %{
4659     int vector_len = 1;
4660     __ movdl($dst$$XMMRegister, $src$$Register);
4661     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4662     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4663     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4664   %}
4665   ins_pipe( pipe_slow );
4666 %}
4667 
4668 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4669   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4670   match(Set dst (ReplicateL src));
4671   effect(TEMP dst, USE src, TEMP tmp);
4672   format %{ "movdl   $dst,$src.lo\n\t"
4673             "movdl   $tmp,$src.hi\n\t"
4674             "punpckldq $dst,$tmp\n\t"
4675             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4676   ins_encode %{
4677     int vector_len = 2;
4678     __ movdl($dst$$XMMRegister, $src$$Register);
4679     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4680     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4681     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4682   %}
4683   ins_pipe( pipe_slow );
4684 %}
4685 #endif // _LP64
4686 
4687 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4688   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4689   match(Set dst (ReplicateL con));
4690   format %{ "movq    $dst,[$constantaddress]\n\t"
4691             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4692   ins_encode %{
4693     int vector_len = 1;
4694     __ movq($dst$$XMMRegister, $constantaddress($con));
4695     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4696   %}
4697   ins_pipe( pipe_slow );
4698 %}
4699 
4700 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4701   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4702   match(Set dst (ReplicateL con));
4703   format %{ "movq    $dst,[$constantaddress]\n\t"
4704             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4705   ins_encode %{
4706     int vector_len = 2;
4707     __ movq($dst$$XMMRegister, $constantaddress($con));
4708     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4709   %}
4710   ins_pipe( pipe_slow );
4711 %}
4712 
4713 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4714   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4715   match(Set dst (ReplicateL (LoadL mem)));
4716   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4717   ins_encode %{
4718     int vector_len = 0;
4719     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4720   %}
4721   ins_pipe( pipe_slow );
4722 %}
4723 
4724 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4725   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4726   match(Set dst (ReplicateL (LoadL mem)));
4727   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4728   ins_encode %{
4729     int vector_len = 1;
4730     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4731   %}
4732   ins_pipe( pipe_slow );
4733 %}
4734 
4735 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4736   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4737   match(Set dst (ReplicateL (LoadL mem)));
4738   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4739   ins_encode %{
4740     int vector_len = 2;
4741     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4742   %}
4743   ins_pipe( pipe_slow );
4744 %}
4745 
4746 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4747   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4748   match(Set dst (ReplicateL zero));
4749   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4750   ins_encode %{
4751     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4752     int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4754   %}
4755   ins_pipe( fpu_reg_reg );
4756 %}
4757 
4758 instruct Repl8F_evex(vecY dst, regF src) %{
4759   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4760   match(Set dst (ReplicateF src));
4761   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4762   ins_encode %{
4763     int vector_len = 1;
4764     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4765   %}
4766   ins_pipe( pipe_slow );
4767 %}
4768 
4769 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4770   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4771   match(Set dst (ReplicateF (LoadF mem)));
4772   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4773   ins_encode %{
4774     int vector_len = 1;
4775     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4776   %}
4777   ins_pipe( pipe_slow );
4778 %}
4779 
4780 instruct Repl16F_evex(vecZ dst, regF src) %{
4781   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4782   match(Set dst (ReplicateF src));
4783   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4784   ins_encode %{
4785     int vector_len = 2;
4786     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4787   %}
4788   ins_pipe( pipe_slow );
4789 %}
4790 
4791 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4792   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4793   match(Set dst (ReplicateF (LoadF mem)));
4794   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4795   ins_encode %{
4796     int vector_len = 2;
4797     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4798   %}
4799   ins_pipe( pipe_slow );
4800 %}
4801 
4802 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4803   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4804   match(Set dst (ReplicateF zero));
4805   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4806   ins_encode %{
    // Use vpxor in place of vxorps: EVEX-encoded vxorps requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4810   %}
4811   ins_pipe( fpu_reg_reg );
4812 %}
4813 
4814 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4815   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4816   match(Set dst (ReplicateF zero));
4817   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4818   ins_encode %{
    // Use vpxor in place of vxorps: EVEX-encoded vxorps requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4822   %}
4823   ins_pipe( fpu_reg_reg );
4824 %}
4825 
4826 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4827   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4828   match(Set dst (ReplicateF zero));
4829   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4830   ins_encode %{
    // Use vpxor in place of vxorps: EVEX-encoded vxorps requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4834   %}
4835   ins_pipe( fpu_reg_reg );
4836 %}
4837 
4838 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4839   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4840   match(Set dst (ReplicateF zero));
4841   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4842   ins_encode %{
    // Use vpxor in place of vxorps: EVEX-encoded vxorps requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4846   %}
4847   ins_pipe( fpu_reg_reg );
4848 %}
4849 
4850 instruct Repl4D_evex(vecY dst, regD src) %{
4851   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4852   match(Set dst (ReplicateD src));
4853   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4854   ins_encode %{
4855     int vector_len = 1;
4856     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4857   %}
4858   ins_pipe( pipe_slow );
4859 %}
4860 
4861 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4862   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4863   match(Set dst (ReplicateD (LoadD mem)));
4864   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4865   ins_encode %{
4866     int vector_len = 1;
4867     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4868   %}
4869   ins_pipe( pipe_slow );
4870 %}
4871 
4872 instruct Repl8D_evex(vecZ dst, regD src) %{
4873   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4874   match(Set dst (ReplicateD src));
4875   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4876   ins_encode %{
4877     int vector_len = 2;
4878     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4879   %}
4880   ins_pipe( pipe_slow );
4881 %}
4882 
4883 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4884   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4885   match(Set dst (ReplicateD (LoadD mem)));
4886   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4887   ins_encode %{
4888     int vector_len = 2;
4889     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4890   %}
4891   ins_pipe( pipe_slow );
4892 %}
4893 
4894 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4895   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4896   match(Set dst (ReplicateD zero));
4897   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4898   ins_encode %{
    // Use vpxor in place of vxorpd: EVEX-encoded vxorpd requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4902   %}
4903   ins_pipe( fpu_reg_reg );
4904 %}
4905 
4906 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4907   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4908   match(Set dst (ReplicateD zero));
4909   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4910   ins_encode %{
    // Use vpxor in place of vxorpd: EVEX-encoded vxorpd requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4914   %}
4915   ins_pipe( fpu_reg_reg );
4916 %}
4917 
4918 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4919   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4920   match(Set dst (ReplicateD zero));
4921   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4922   ins_encode %{
    // Use vpxor in place of vxorpd: EVEX-encoded vxorpd requires AVX512DQ, and this is emitted as a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4926   %}
4927   ins_pipe( fpu_reg_reg );
4928 %}
4929 
4930 // ====================REDUCTION ARITHMETIC=======================================
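// These rules fold a vector (src2) into a scalar and combine the result with the scalar
// accumulator (src1).  The SSE and AVX1 forms use horizontal adds (phaddd/vphaddd); the
// EVEX forms repeatedly halve the vector with extracts and shuffles, add the halves, then
// add the accumulator and move the final element into a general-purpose register.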
4931 
4932 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4933   predicate(UseSSE > 2 && UseAVX == 0);
4934   match(Set dst (AddReductionVI src1 src2));
4935   effect(TEMP tmp2, TEMP tmp);
4936   format %{ "movdqu  $tmp2,$src2\n\t"
4937             "phaddd  $tmp2,$tmp2\n\t"
4938             "movd    $tmp,$src1\n\t"
4939             "paddd   $tmp,$tmp2\n\t"
4940             "movd    $dst,$tmp\t! add reduction2I" %}
4941   ins_encode %{
4942     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4943     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4944     __ movdl($tmp$$XMMRegister, $src1$$Register);
4945     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4946     __ movdl($dst$$Register, $tmp$$XMMRegister);
4947   %}
4948   ins_pipe( pipe_slow );
4949 %}
4950 
4951 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4952   predicate(VM_Version::supports_avxonly());
4953   match(Set dst (AddReductionVI src1 src2));
4954   effect(TEMP tmp, TEMP tmp2);
4955   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4956             "movd     $tmp2,$src1\n\t"
4957             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4958             "movd     $dst,$tmp2\t! add reduction2I" %}
4959   ins_encode %{
4960     int vector_len = 0;
4961     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4962     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4963     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4964     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4965   %}
4966   ins_pipe( pipe_slow );
4967 %}
4968 
4969 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4970   predicate(UseAVX > 2);
4971   match(Set dst (AddReductionVI src1 src2));
4972   effect(TEMP tmp, TEMP tmp2);
4973   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4974             "vpaddd  $tmp,$src2,$tmp2\n\t"
4975             "movd    $tmp2,$src1\n\t"
4976             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4977             "movd    $dst,$tmp2\t! add reduction2I" %}
4978   ins_encode %{
4979     int vector_len = 0;
4980     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4981     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4982     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4983     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4984     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4985   %}
4986   ins_pipe( pipe_slow );
4987 %}
4988 
4989 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4990   predicate(UseSSE > 2 && UseAVX == 0);
4991   match(Set dst (AddReductionVI src1 src2));
4992   effect(TEMP tmp, TEMP tmp2);
4993   format %{ "movdqu  $tmp,$src2\n\t"
4994             "phaddd  $tmp,$tmp\n\t"
4995             "phaddd  $tmp,$tmp\n\t"
4996             "movd    $tmp2,$src1\n\t"
4997             "paddd   $tmp2,$tmp\n\t"
4998             "movd    $dst,$tmp2\t! add reduction4I" %}
4999   ins_encode %{
5000     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
5001     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5002     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5003     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5004     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
5005     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5006   %}
5007   ins_pipe( pipe_slow );
5008 %}
5009 
5010 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5011   predicate(VM_Version::supports_avxonly());
5012   match(Set dst (AddReductionVI src1 src2));
5013   effect(TEMP tmp, TEMP tmp2);
5014   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5015             "vphaddd  $tmp,$tmp,$tmp\n\t"
5016             "movd     $tmp2,$src1\n\t"
5017             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5018             "movd     $dst,$tmp2\t! add reduction4I" %}
5019   ins_encode %{
5020     int vector_len = 0;
5021     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5022     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
5023     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5024     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5025     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5026   %}
5027   ins_pipe( pipe_slow );
5028 %}
5029 
5030 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5031   predicate(UseAVX > 2);
5032   match(Set dst (AddReductionVI src1 src2));
5033   effect(TEMP tmp, TEMP tmp2);
5034   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5035             "vpaddd  $tmp,$src2,$tmp2\n\t"
5036             "pshufd  $tmp2,$tmp,0x1\n\t"
5037             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5038             "movd    $tmp2,$src1\n\t"
5039             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5040             "movd    $dst,$tmp2\t! add reduction4I" %}
5041   ins_encode %{
5042     int vector_len = 0;
5043     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5044     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5045     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5046     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5047     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5048     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5049     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5050   %}
5051   ins_pipe( pipe_slow );
5052 %}
5053 
5054 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5055   predicate(VM_Version::supports_avxonly());
5056   match(Set dst (AddReductionVI src1 src2));
5057   effect(TEMP tmp, TEMP tmp2);
5058   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5059             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5060             "vextracti128_high  $tmp2,$tmp\n\t"
5061             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5062             "movd     $tmp2,$src1\n\t"
5063             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5064             "movd     $dst,$tmp2\t! add reduction8I" %}
5065   ins_encode %{
5066     int vector_len = 1;
5067     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5068     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5069     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5070     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5071     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5072     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5073     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5074   %}
5075   ins_pipe( pipe_slow );
5076 %}
5077 
5078 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5079   predicate(UseAVX > 2);
5080   match(Set dst (AddReductionVI src1 src2));
5081   effect(TEMP tmp, TEMP tmp2);
5082   format %{ "vextracti128_high  $tmp,$src2\n\t"
5083             "vpaddd  $tmp,$tmp,$src2\n\t"
5084             "pshufd  $tmp2,$tmp,0xE\n\t"
5085             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5086             "pshufd  $tmp2,$tmp,0x1\n\t"
5087             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5088             "movd    $tmp2,$src1\n\t"
5089             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5090             "movd    $dst,$tmp2\t! add reduction8I" %}
5091   ins_encode %{
5092     int vector_len = 0;
5093     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5094     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5095     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5096     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5097     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5098     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5099     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5100     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5101     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5102   %}
5103   ins_pipe( pipe_slow );
5104 %}
5105 
5106 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5107   predicate(UseAVX > 2);
5108   match(Set dst (AddReductionVI src1 src2));
5109   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5110   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5111             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5112             "vextracti128_high  $tmp,$tmp3\n\t"
5113             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5114             "pshufd  $tmp2,$tmp,0xE\n\t"
5115             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5116             "pshufd  $tmp2,$tmp,0x1\n\t"
5117             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5118             "movd    $tmp2,$src1\n\t"
5119             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5120             "movd    $dst,$tmp2\t! add reduction16I" %}
5121   ins_encode %{
5122     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5123     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5124     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5125     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5126     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5127     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5128     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5129     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5130     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5131     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5132     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5133   %}
5134   ins_pipe( pipe_slow );
5135 %}
5136 
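// The long (AddReductionVL) forms below are LP64-only: the scalar accumulator lives in
// a 64-bit GPR and is transferred with movdq.  The folding itself is the same idea as
// above, using pshufd 0xE to bring the upper quadword down and vpaddq to combine.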
5137 #ifdef _LP64
5138 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5139   predicate(UseAVX > 2);
5140   match(Set dst (AddReductionVL src1 src2));
5141   effect(TEMP tmp, TEMP tmp2);
5142   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5143             "vpaddq  $tmp,$src2,$tmp2\n\t"
5144             "movdq   $tmp2,$src1\n\t"
5145             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5146             "movdq   $dst,$tmp2\t! add reduction2L" %}
5147   ins_encode %{
5148     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5149     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5150     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5151     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5152     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5153   %}
5154   ins_pipe( pipe_slow );
5155 %}
5156 
5157 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5158   predicate(UseAVX > 2);
5159   match(Set dst (AddReductionVL src1 src2));
5160   effect(TEMP tmp, TEMP tmp2);
5161   format %{ "vextracti128_high  $tmp,$src2\n\t"
5162             "vpaddq  $tmp2,$tmp,$src2\n\t"
5163             "pshufd  $tmp,$tmp2,0xE\n\t"
5164             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5165             "movdq   $tmp,$src1\n\t"
5166             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5167             "movdq   $dst,$tmp2\t! add reduction4L" %}
5168   ins_encode %{
5169     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5170     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5171     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5172     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5173     __ movdq($tmp$$XMMRegister, $src1$$Register);
5174     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5175     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5176   %}
5177   ins_pipe( pipe_slow );
5178 %}
5179 
5180 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5181   predicate(UseAVX > 2);
5182   match(Set dst (AddReductionVL src1 src2));
5183   effect(TEMP tmp, TEMP tmp2);
5184   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5185             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5186             "vextracti128_high  $tmp,$tmp2\n\t"
5187             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5188             "pshufd  $tmp,$tmp2,0xE\n\t"
5189             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5190             "movdq   $tmp,$src1\n\t"
5191             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5192             "movdq   $dst,$tmp2\t! add reduction8L" %}
5193   ins_encode %{
5194     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5195     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5196     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5197     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5198     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5199     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5200     __ movdq($tmp$$XMMRegister, $src1$$Register);
5201     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5202     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5203   %}
5204   ins_pipe( pipe_slow );
5205 %}
5206 #endif
5207 
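// The floating-point add reductions below keep dst as the running accumulator (match
// rule "Set dst (AddReductionVF dst src2)") and add the lanes one at a time with scalar
// addss/addsd (vaddss/vaddsd); pshufd only rotates the next lane into element 0.
// A strictly ordered scalar chain is used instead of a shuffle-and-add tree, presumably
// because FP addition is not associative and the lane order must be preserved.
// Illustrative sketch only (not emitted code):
//
//   float fadd_reduce(float acc, float src2[], int lanes) {
//     for (int i = 0; i < lanes; i++) acc += src2[i];   // strictly in lane order
//     return acc;
//   }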
5208 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5209   predicate(UseSSE >= 1 && UseAVX == 0);
5210   match(Set dst (AddReductionVF dst src2));
5211   effect(TEMP dst, TEMP tmp);
5212   format %{ "addss   $dst,$src2\n\t"
5213             "pshufd  $tmp,$src2,0x01\n\t"
5214             "addss   $dst,$tmp\t! add reduction2F" %}
5215   ins_encode %{
5216     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5217     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5218     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5219   %}
5220   ins_pipe( pipe_slow );
5221 %}
5222 
5223 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5224   predicate(UseAVX > 0);
5225   match(Set dst (AddReductionVF dst src2));
5226   effect(TEMP dst, TEMP tmp);
5227   format %{ "vaddss  $dst,$dst,$src2\n\t"
5228             "pshufd  $tmp,$src2,0x01\n\t"
5229             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5230   ins_encode %{
5231     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5232     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5233     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5234   %}
5235   ins_pipe( pipe_slow );
5236 %}
5237 
5238 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5239   predicate(UseSSE >= 1 && UseAVX == 0);
5240   match(Set dst (AddReductionVF dst src2));
5241   effect(TEMP dst, TEMP tmp);
5242   format %{ "addss   $dst,$src2\n\t"
5243             "pshufd  $tmp,$src2,0x01\n\t"
5244             "addss   $dst,$tmp\n\t"
5245             "pshufd  $tmp,$src2,0x02\n\t"
5246             "addss   $dst,$tmp\n\t"
5247             "pshufd  $tmp,$src2,0x03\n\t"
5248             "addss   $dst,$tmp\t! add reduction4F" %}
5249   ins_encode %{
5250     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5251     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5252     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5253     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5254     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5255     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5256     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5257   %}
5258   ins_pipe( pipe_slow );
5259 %}
5260 
5261 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5262   predicate(UseAVX > 0);
5263   match(Set dst (AddReductionVF dst src2));
5264   effect(TEMP tmp, TEMP dst);
5265   format %{ "vaddss  $dst,$dst,$src2\n\t"
5266             "pshufd  $tmp,$src2,0x01\n\t"
5267             "vaddss  $dst,$dst,$tmp\n\t"
5268             "pshufd  $tmp,$src2,0x02\n\t"
5269             "vaddss  $dst,$dst,$tmp\n\t"
5270             "pshufd  $tmp,$src2,0x03\n\t"
5271             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5272   ins_encode %{
5273     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5274     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5275     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5276     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5277     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5278     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5279     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5280   %}
5281   ins_pipe( pipe_slow );
5282 %}
5283 
5284 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5285   predicate(UseAVX > 0);
5286   match(Set dst (AddReductionVF dst src2));
5287   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5288   format %{ "vaddss  $dst,$dst,$src2\n\t"
5289             "pshufd  $tmp,$src2,0x01\n\t"
5290             "vaddss  $dst,$dst,$tmp\n\t"
5291             "pshufd  $tmp,$src2,0x02\n\t"
5292             "vaddss  $dst,$dst,$tmp\n\t"
5293             "pshufd  $tmp,$src2,0x03\n\t"
5294             "vaddss  $dst,$dst,$tmp\n\t"
5295             "vextractf128_high  $tmp2,$src2\n\t"
5296             "vaddss  $dst,$dst,$tmp2\n\t"
5297             "pshufd  $tmp,$tmp2,0x01\n\t"
5298             "vaddss  $dst,$dst,$tmp\n\t"
5299             "pshufd  $tmp,$tmp2,0x02\n\t"
5300             "vaddss  $dst,$dst,$tmp\n\t"
5301             "pshufd  $tmp,$tmp2,0x03\n\t"
5302             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5303   ins_encode %{
5304     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5305     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5306     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5307     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5308     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5309     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5310     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5311     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5312     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5313     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5314     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5315     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5316     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5317     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5318     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5319   %}
5320   ins_pipe( pipe_slow );
5321 %}
5322 
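// The 16F form below walks the four 128-bit quarters of the 512-bit source with
// vextractf32x4 imm 0x1..0x3 (the low quarter is used directly), reducing each quarter
// with the same four-lane vaddss/pshufd sequence as above.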
5323 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5324   predicate(UseAVX > 2);
5325   match(Set dst (AddReductionVF dst src2));
5326   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5327   format %{ "vaddss  $dst,$dst,$src2\n\t"
5328             "pshufd  $tmp,$src2,0x01\n\t"
5329             "vaddss  $dst,$dst,$tmp\n\t"
5330             "pshufd  $tmp,$src2,0x02\n\t"
5331             "vaddss  $dst,$dst,$tmp\n\t"
5332             "pshufd  $tmp,$src2,0x03\n\t"
5333             "vaddss  $dst,$dst,$tmp\n\t"
5334             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5335             "vaddss  $dst,$dst,$tmp2\n\t"
5336             "pshufd  $tmp,$tmp2,0x01\n\t"
5337             "vaddss  $dst,$dst,$tmp\n\t"
5338             "pshufd  $tmp,$tmp2,0x02\n\t"
5339             "vaddss  $dst,$dst,$tmp\n\t"
5340             "pshufd  $tmp,$tmp2,0x03\n\t"
5341             "vaddss  $dst,$dst,$tmp\n\t"
5342             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5343             "vaddss  $dst,$dst,$tmp2\n\t"
5344             "pshufd  $tmp,$tmp2,0x01\n\t"
5345             "vaddss  $dst,$dst,$tmp\n\t"
5346             "pshufd  $tmp,$tmp2,0x02\n\t"
5347             "vaddss  $dst,$dst,$tmp\n\t"
5348             "pshufd  $tmp,$tmp2,0x03\n\t"
5349             "vaddss  $dst,$dst,$tmp\n\t"
5350             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5351             "vaddss  $dst,$dst,$tmp2\n\t"
5352             "pshufd  $tmp,$tmp2,0x01\n\t"
5353             "vaddss  $dst,$dst,$tmp\n\t"
5354             "pshufd  $tmp,$tmp2,0x02\n\t"
5355             "vaddss  $dst,$dst,$tmp\n\t"
5356             "pshufd  $tmp,$tmp2,0x03\n\t"
5357             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5358   ins_encode %{
5359     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5360     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5361     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5362     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5363     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5364     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5365     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5366     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5367     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5368     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5369     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5370     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5371     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5372     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5373     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5374     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5375     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5376     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5377     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5378     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5379     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5380     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5381     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5382     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5383     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5384     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5385     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5386     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5387     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5388     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5389     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5390   %}
5391   ins_pipe( pipe_slow );
5392 %}
5393 
5394 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5395   predicate(UseSSE >= 1 && UseAVX == 0);
5396   match(Set dst (AddReductionVD dst src2));
5397   effect(TEMP tmp, TEMP dst);
5398   format %{ "addsd   $dst,$src2\n\t"
5399             "pshufd  $tmp,$src2,0xE\n\t"
5400             "addsd   $dst,$tmp\t! add reduction2D" %}
5401   ins_encode %{
5402     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5403     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5404     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5405   %}
5406   ins_pipe( pipe_slow );
5407 %}
5408 
5409 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5410   predicate(UseAVX > 0);
5411   match(Set dst (AddReductionVD dst src2));
5412   effect(TEMP tmp, TEMP dst);
5413   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5414             "pshufd  $tmp,$src2,0xE\n\t"
5415             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5416   ins_encode %{
5417     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5418     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5419     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5420   %}
5421   ins_pipe( pipe_slow );
5422 %}
5423 
5424 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5425   predicate(UseAVX > 0);
5426   match(Set dst (AddReductionVD dst src2));
5427   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5428   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5429             "pshufd  $tmp,$src2,0xE\n\t"
5430             "vaddsd  $dst,$dst,$tmp\n\t"
5431             "vextractf128  $tmp2,$src2,0x1\n\t"
5432             "vaddsd  $dst,$dst,$tmp2\n\t"
5433             "pshufd  $tmp,$tmp2,0xE\n\t"
5434             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5435   ins_encode %{
5436     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5437     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5438     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5439     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5440     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5441     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5442     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5443   %}
5444   ins_pipe( pipe_slow );
5445 %}
5446 
5447 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5448   predicate(UseAVX > 2);
5449   match(Set dst (AddReductionVD dst src2));
5450   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5451   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5452             "pshufd  $tmp,$src2,0xE\n\t"
5453             "vaddsd  $dst,$dst,$tmp\n\t"
5454             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5455             "vaddsd  $dst,$dst,$tmp2\n\t"
5456             "pshufd  $tmp,$tmp2,0xE\n\t"
5457             "vaddsd  $dst,$dst,$tmp\n\t"
5458             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5459             "vaddsd  $dst,$dst,$tmp2\n\t"
5460             "pshufd  $tmp,$tmp2,0xE\n\t"
5461             "vaddsd  $dst,$dst,$tmp\n\t"
5462             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5463             "vaddsd  $dst,$dst,$tmp2\n\t"
5464             "pshufd  $tmp,$tmp2,0xE\n\t"
5465             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5466   ins_encode %{
5467     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5468     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5469     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5470     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5471     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5472     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5473     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5474     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5475     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5476     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5477     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5478     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5479     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5480     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5481     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5482   %}
5483   ins_pipe( pipe_slow );
5484 %}
5485 
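// The integer multiply reductions below mirror the add reductions above, with
// pmulld/vpmulld in place of the adds.  The SSE forms require SSE4.1 (UseSSE > 3)
// because pmulld is an SSE4.1 instruction, and, being destructive two-operand forms,
// they build the shuffled copy in a TEMP register first.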
5486 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5487   predicate(UseSSE > 3 && UseAVX == 0);
5488   match(Set dst (MulReductionVI src1 src2));
5489   effect(TEMP tmp, TEMP tmp2);
5490   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5491             "pmulld  $tmp2,$src2\n\t"
5492             "movd    $tmp,$src1\n\t"
5493             "pmulld  $tmp2,$tmp\n\t"
5494             "movd    $dst,$tmp2\t! mul reduction2I" %}
5495   ins_encode %{
5496     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5497     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5498     __ movdl($tmp$$XMMRegister, $src1$$Register);
5499     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5500     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5501   %}
5502   ins_pipe( pipe_slow );
5503 %}
5504 
5505 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5506   predicate(UseAVX > 0);
5507   match(Set dst (MulReductionVI src1 src2));
5508   effect(TEMP tmp, TEMP tmp2);
5509   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5510             "vpmulld  $tmp,$src2,$tmp2\n\t"
5511             "movd     $tmp2,$src1\n\t"
5512             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5513             "movd     $dst,$tmp2\t! mul reduction2I" %}
5514   ins_encode %{
5515     int vector_len = 0;
5516     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5517     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5518     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5519     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5520     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5521   %}
5522   ins_pipe( pipe_slow );
5523 %}
5524 
5525 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5526   predicate(UseSSE > 3 && UseAVX == 0);
5527   match(Set dst (MulReductionVI src1 src2));
5528   effect(TEMP tmp, TEMP tmp2);
5529   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5530             "pmulld  $tmp2,$src2\n\t"
5531             "pshufd  $tmp,$tmp2,0x1\n\t"
5532             "pmulld  $tmp2,$tmp\n\t"
5533             "movd    $tmp,$src1\n\t"
5534             "pmulld  $tmp2,$tmp\n\t"
5535             "movd    $dst,$tmp2\t! mul reduction4I" %}
5536   ins_encode %{
5537     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5538     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5539     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5540     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5541     __ movdl($tmp$$XMMRegister, $src1$$Register);
5542     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5543     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5544   %}
5545   ins_pipe( pipe_slow );
5546 %}
5547 
5548 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5549   predicate(UseAVX > 0);
5550   match(Set dst (MulReductionVI src1 src2));
5551   effect(TEMP tmp, TEMP tmp2);
5552   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5553             "vpmulld  $tmp,$src2,$tmp2\n\t"
5554             "pshufd   $tmp2,$tmp,0x1\n\t"
5555             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5556             "movd     $tmp2,$src1\n\t"
5557             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5558             "movd     $dst,$tmp2\t! mul reduction4I" %}
5559   ins_encode %{
5560     int vector_len = 0;
5561     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5562     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5563     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5564     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5565     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5566     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5567     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5568   %}
5569   ins_pipe( pipe_slow );
5570 %}
5571 
5572 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5573   predicate(UseAVX > 1);
5574   match(Set dst (MulReductionVI src1 src2));
5575   effect(TEMP tmp, TEMP tmp2);
5576   format %{ "vextracti128_high  $tmp,$src2\n\t"
5577             "vpmulld  $tmp,$tmp,$src2\n\t"
5578             "pshufd   $tmp2,$tmp,0xE\n\t"
5579             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5580             "pshufd   $tmp2,$tmp,0x1\n\t"
5581             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5582             "movd     $tmp2,$src1\n\t"
5583             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5584             "movd     $dst,$tmp2\t! mul reduction8I" %}
5585   ins_encode %{
5586     int vector_len = 0;
5587     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5588     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5589     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5590     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5591     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5592     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5593     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5594     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5595     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5596   %}
5597   ins_pipe( pipe_slow );
5598 %}
5599 
5600 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5601   predicate(UseAVX > 2);
5602   match(Set dst (MulReductionVI src1 src2));
5603   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5604   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5605             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5606             "vextracti128_high  $tmp,$tmp3\n\t"
5607             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5608             "pshufd   $tmp2,$tmp,0xE\n\t"
5609             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5610             "pshufd   $tmp2,$tmp,0x1\n\t"
5611             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5612             "movd     $tmp2,$src1\n\t"
5613             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5614             "movd     $dst,$tmp2\t! mul reduction16I" %}
5615   ins_encode %{
5616     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5617     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5618     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5619     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5620     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5621     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5622     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5623     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5624     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5625     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5626     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5627   %}
5628   ins_pipe( pipe_slow );
5629 %}
5630 
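// The long multiply reductions additionally require AVX-512DQ (see the
// supports_avx512dq() predicates), since vpmullq, the packed 64x64->64-bit multiply
// used here, is only available with that extension.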
5631 #ifdef _LP64
5632 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5633   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5634   match(Set dst (MulReductionVL src1 src2));
5635   effect(TEMP tmp, TEMP tmp2);
5636   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5637             "vpmullq  $tmp,$src2,$tmp2\n\t"
5638             "movdq    $tmp2,$src1\n\t"
5639             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5640             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5641   ins_encode %{
5642     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5643     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5644     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5645     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5646     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5647   %}
5648   ins_pipe( pipe_slow );
5649 %}
5650 
5651 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5652   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5653   match(Set dst (MulReductionVL src1 src2));
5654   effect(TEMP tmp, TEMP tmp2);
5655   format %{ "vextracti128_high  $tmp,$src2\n\t"
5656             "vpmullq  $tmp2,$tmp,$src2\n\t"
5657             "pshufd   $tmp,$tmp2,0xE\n\t"
5658             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5659             "movdq    $tmp,$src1\n\t"
5660             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5661             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5662   ins_encode %{
5663     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5664     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5665     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5666     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5667     __ movdq($tmp$$XMMRegister, $src1$$Register);
5668     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5669     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5670   %}
5671   ins_pipe( pipe_slow );
5672 %}
5673 
5674 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5675   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5676   match(Set dst (MulReductionVL src1 src2));
5677   effect(TEMP tmp, TEMP tmp2);
5678   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5679             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5680             "vextracti128_high  $tmp,$tmp2\n\t"
5681             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5682             "pshufd   $tmp,$tmp2,0xE\n\t"
5683             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5684             "movdq    $tmp,$src1\n\t"
5685             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5686             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5687   ins_encode %{
5688     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5689     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5690     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5691     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5692     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5693     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5694     __ movdq($tmp$$XMMRegister, $src1$$Register);
5695     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5696     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5697   %}
5698   ins_pipe( pipe_slow );
5699 %}
5700 #endif
5701 
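// The FP multiply reductions below follow the same ordered scalar-chain pattern as the
// FP add reductions, with mulss/mulsd (vmulss/vmulsd) replacing the adds.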
5702 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5703   predicate(UseSSE >= 1 && UseAVX == 0);
5704   match(Set dst (MulReductionVF dst src2));
5705   effect(TEMP dst, TEMP tmp);
5706   format %{ "mulss   $dst,$src2\n\t"
5707             "pshufd  $tmp,$src2,0x01\n\t"
5708             "mulss   $dst,$tmp\t! mul reduction2F" %}
5709   ins_encode %{
5710     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5711     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5712     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5713   %}
5714   ins_pipe( pipe_slow );
5715 %}
5716 
5717 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5718   predicate(UseAVX > 0);
5719   match(Set dst (MulReductionVF dst src2));
5720   effect(TEMP tmp, TEMP dst);
5721   format %{ "vmulss  $dst,$dst,$src2\n\t"
5722             "pshufd  $tmp,$src2,0x01\n\t"
5723             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5724   ins_encode %{
5725     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5726     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5727     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5728   %}
5729   ins_pipe( pipe_slow );
5730 %}
5731 
5732 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5733   predicate(UseSSE >= 1 && UseAVX == 0);
5734   match(Set dst (MulReductionVF dst src2));
5735   effect(TEMP dst, TEMP tmp);
5736   format %{ "mulss   $dst,$src2\n\t"
5737             "pshufd  $tmp,$src2,0x01\n\t"
5738             "mulss   $dst,$tmp\n\t"
5739             "pshufd  $tmp,$src2,0x02\n\t"
5740             "mulss   $dst,$tmp\n\t"
5741             "pshufd  $tmp,$src2,0x03\n\t"
5742             "mulss   $dst,$tmp\t! mul reduction4F" %}
5743   ins_encode %{
5744     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5745     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5746     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5747     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5748     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5749     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5750     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5751   %}
5752   ins_pipe( pipe_slow );
5753 %}
5754 
5755 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5756   predicate(UseAVX > 0);
5757   match(Set dst (MulReductionVF dst src2));
5758   effect(TEMP tmp, TEMP dst);
5759   format %{ "vmulss  $dst,$dst,$src2\n\t"
5760             "pshufd  $tmp,$src2,0x01\n\t"
5761             "vmulss  $dst,$dst,$tmp\n\t"
5762             "pshufd  $tmp,$src2,0x02\n\t"
5763             "vmulss  $dst,$dst,$tmp\n\t"
5764             "pshufd  $tmp,$src2,0x03\n\t"
5765             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5766   ins_encode %{
5767     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5768     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5769     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5770     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5771     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5772     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5773     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5774   %}
5775   ins_pipe( pipe_slow );
5776 %}
5777 
5778 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5779   predicate(UseAVX > 0);
5780   match(Set dst (MulReductionVF dst src2));
5781   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5782   format %{ "vmulss  $dst,$dst,$src2\n\t"
5783             "pshufd  $tmp,$src2,0x01\n\t"
5784             "vmulss  $dst,$dst,$tmp\n\t"
5785             "pshufd  $tmp,$src2,0x02\n\t"
5786             "vmulss  $dst,$dst,$tmp\n\t"
5787             "pshufd  $tmp,$src2,0x03\n\t"
5788             "vmulss  $dst,$dst,$tmp\n\t"
5789             "vextractf128_high  $tmp2,$src2\n\t"
5790             "vmulss  $dst,$dst,$tmp2\n\t"
5791             "pshufd  $tmp,$tmp2,0x01\n\t"
5792             "vmulss  $dst,$dst,$tmp\n\t"
5793             "pshufd  $tmp,$tmp2,0x02\n\t"
5794             "vmulss  $dst,$dst,$tmp\n\t"
5795             "pshufd  $tmp,$tmp2,0x03\n\t"
5796             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5797   ins_encode %{
5798     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5799     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5800     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5801     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5802     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5803     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5804     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5805     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5806     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5807     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5808     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5809     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5810     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5811     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5812     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5813   %}
5814   ins_pipe( pipe_slow );
5815 %}
5816 
5817 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5818   predicate(UseAVX > 2);
5819   match(Set dst (MulReductionVF dst src2));
5820   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5821   format %{ "vmulss  $dst,$dst,$src2\n\t"
5822             "pshufd  $tmp,$src2,0x01\n\t"
5823             "vmulss  $dst,$dst,$tmp\n\t"
5824             "pshufd  $tmp,$src2,0x02\n\t"
5825             "vmulss  $dst,$dst,$tmp\n\t"
5826             "pshufd  $tmp,$src2,0x03\n\t"
5827             "vmulss  $dst,$dst,$tmp\n\t"
5828             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5829             "vmulss  $dst,$dst,$tmp2\n\t"
5830             "pshufd  $tmp,$tmp2,0x01\n\t"
5831             "vmulss  $dst,$dst,$tmp\n\t"
5832             "pshufd  $tmp,$tmp2,0x02\n\t"
5833             "vmulss  $dst,$dst,$tmp\n\t"
5834             "pshufd  $tmp,$tmp2,0x03\n\t"
5835             "vmulss  $dst,$dst,$tmp\n\t"
5836             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5837             "vmulss  $dst,$dst,$tmp2\n\t"
5838             "pshufd  $tmp,$tmp2,0x01\n\t"
5839             "vmulss  $dst,$dst,$tmp\n\t"
5840             "pshufd  $tmp,$tmp2,0x02\n\t"
5841             "vmulss  $dst,$dst,$tmp\n\t"
5842             "pshufd  $tmp,$tmp2,0x03\n\t"
5843             "vmulss  $dst,$dst,$tmp\n\t"
5844             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5845             "vmulss  $dst,$dst,$tmp2\n\t"
5846             "pshufd  $tmp,$tmp2,0x01\n\t"
5847             "vmulss  $dst,$dst,$tmp\n\t"
5848             "pshufd  $tmp,$tmp2,0x02\n\t"
5849             "vmulss  $dst,$dst,$tmp\n\t"
5850             "pshufd  $tmp,$tmp2,0x03\n\t"
5851             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5852   ins_encode %{
5853     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5854     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5855     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5856     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5857     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5858     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5859     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5860     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5861     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5862     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5863     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5864     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5865     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5866     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5867     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5868     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5869     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5870     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5871     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5872     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5873     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5874     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5875     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5876     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5877     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5878     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5879     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5880     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5881     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5882     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5883     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5884   %}
5885   ins_pipe( pipe_slow );
5886 %}
5887 
5888 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5889   predicate(UseSSE >= 1 && UseAVX == 0);
5890   match(Set dst (MulReductionVD dst src2));
5891   effect(TEMP dst, TEMP tmp);
5892   format %{ "mulsd   $dst,$src2\n\t"
5893             "pshufd  $tmp,$src2,0xE\n\t"
5894             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5895   ins_encode %{
5896     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5897     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5898     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5899   %}
5900   ins_pipe( pipe_slow );
5901 %}
5902 
5903 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5904   predicate(UseAVX > 0);
5905   match(Set dst (MulReductionVD dst src2));
5906   effect(TEMP tmp, TEMP dst);
5907   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5908             "pshufd  $tmp,$src2,0xE\n\t"
5909             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5910   ins_encode %{
5911     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5912     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5913     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5914   %}
5915   ins_pipe( pipe_slow );
5916 %}
5917 
5918 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
5919   predicate(UseAVX > 0);
5920   match(Set dst (MulReductionVD dst src2));
5921   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5922   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5923             "pshufd  $tmp,$src2,0xE\n\t"
5924             "vmulsd  $dst,$dst,$tmp\n\t"
5925             "vextractf128_high  $tmp2,$src2\n\t"
5926             "vmulsd  $dst,$dst,$tmp2\n\t"
5927             "pshufd  $tmp,$tmp2,0xE\n\t"
5928             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5929   ins_encode %{
5930     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5931     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5932     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5933     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5934     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5935     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5936     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5937   %}
5938   ins_pipe( pipe_slow );
5939 %}
5940 
5941 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5942   predicate(UseAVX > 2);
5943   match(Set dst (MulReductionVD dst src2));
5944   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5945   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5946             "pshufd  $tmp,$src2,0xE\n\t"
5947             "vmulsd  $dst,$dst,$tmp\n\t"
5948             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5949             "vmulsd  $dst,$dst,$tmp2\n\t"
5950             "pshufd  $tmp,$tmp2,0xE\n\t"
5951             "vmulsd  $dst,$dst,$tmp\n\t"
5952             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5953             "vmulsd  $dst,$dst,$tmp2\n\t"
5954             "pshufd  $tmp,$tmp2,0xE\n\t"
5955             "vmulsd  $dst,$dst,$tmp\n\t"
5956             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5957             "vmulsd  $dst,$dst,$tmp2\n\t"
5958             "pshufd  $tmp,$tmp2,0xE\n\t"
5959             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5960   ins_encode %{
5961     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5962     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5963     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5964     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5965     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5966     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5967     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5968     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5969     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5970     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5971     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5972     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5973     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5974     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5975     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5976   %}
5977   ins_pipe( pipe_slow );
5978 %}
5979 
5980 // ====================VECTOR ARITHMETIC=======================================
5981 
5982 // --------------------------------- ADD --------------------------------------
5983 
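// Each element type below follows the same scheme: a non-AVX form that uses the
// destructive two-operand SSE instruction (e.g. "paddb $dst,$src"), an AVX reg-reg
// form, and an AVX reg-mem form that folds the LoadVector into the instruction's
// memory operand.  The operand classes scale with vector width (vecS/vecD/vecX/vecY/vecZ
// for 32/64/128/256/512 bits) and vector_len selects the encoded length:
// 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.  Illustrative sketch of the packed add itself
// (not emitted code):
//
//   for (int i = 0; i < lanes; i++) dst[i] = src1[i] + src2[i];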
5984 // Bytes vector add
5985 instruct vadd4B(vecS dst, vecS src) %{
5986   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5987   match(Set dst (AddVB dst src));
5988   format %{ "paddb   $dst,$src\t! add packed4B" %}
5989   ins_encode %{
5990     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5991   %}
5992   ins_pipe( pipe_slow );
5993 %}
5994 
5995 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5996   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5997   match(Set dst (AddVB src1 src2));
5998   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5999   ins_encode %{
6000     int vector_len = 0;
6001     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6002   %}
6003   ins_pipe( pipe_slow );
6004 %}
6005 
6006 
6007 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
6008   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6009   match(Set dst (AddVB src (LoadVector mem)));
6010   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6011   ins_encode %{
6012     int vector_len = 0;
6013     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6014   %}
6015   ins_pipe( pipe_slow );
6016 %}
6017 
6018 instruct vadd8B(vecD dst, vecD src) %{
6019   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6020   match(Set dst (AddVB dst src));
6021   format %{ "paddb   $dst,$src\t! add packed8B" %}
6022   ins_encode %{
6023     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6024   %}
6025   ins_pipe( pipe_slow );
6026 %}
6027 
6028 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6029   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6030   match(Set dst (AddVB src1 src2));
6031   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6032   ins_encode %{
6033     int vector_len = 0;
6034     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6035   %}
6036   ins_pipe( pipe_slow );
6037 %}
6038 
6039 
6040 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6041   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6042   match(Set dst (AddVB src (LoadVector mem)));
6043   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6044   ins_encode %{
6045     int vector_len = 0;
6046     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6047   %}
6048   ins_pipe( pipe_slow );
6049 %}
6050 
6051 instruct vadd16B(vecX dst, vecX src) %{
6052   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6053   match(Set dst (AddVB dst src));
6054   format %{ "paddb   $dst,$src\t! add packed16B" %}
6055   ins_encode %{
6056     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6057   %}
6058   ins_pipe( pipe_slow );
6059 %}
6060 
6061 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6062   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6063   match(Set dst (AddVB src1 src2));
6064   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6065   ins_encode %{
6066     int vector_len = 0;
6067     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6068   %}
6069   ins_pipe( pipe_slow );
6070 %}
6071 
6072 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6073   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6074   match(Set dst (AddVB src (LoadVector mem)));
6075   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6076   ins_encode %{
6077     int vector_len = 0;
6078     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6079   %}
6080   ins_pipe( pipe_slow );
6081 %}
6082 
6083 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6084   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6085   match(Set dst (AddVB src1 src2));
6086   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6087   ins_encode %{
6088     int vector_len = 1;
6089     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6090   %}
6091   ins_pipe( pipe_slow );
6092 %}
6093 
6094 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6095   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6096   match(Set dst (AddVB src (LoadVector mem)));
6097   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6098   ins_encode %{
6099     int vector_len = 1;
6100     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6101   %}
6102   ins_pipe( pipe_slow );
6103 %}
6104 
6105 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6106   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6107   match(Set dst (AddVB src1 src2));
6108   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6109   ins_encode %{
6110     int vector_len = 2;
6111     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6112   %}
6113   ins_pipe( pipe_slow );
6114 %}
6115 
6116 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6117   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6118   match(Set dst (AddVB src (LoadVector mem)));
6119   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6120   ins_encode %{
6121     int vector_len = 2;
6122     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6123   %}
6124   ins_pipe( pipe_slow );
6125 %}
6126 
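// Note: the 512-bit byte and short forms (packed64B above, packed32S below) additionally
// require AVX512BW, since byte and word operations on 512-bit vectors belong to that
// extension; see the supports_avx512bw() predicates.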
6127 // Shorts/Chars vector add
6128 instruct vadd2S(vecS dst, vecS src) %{
6129   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6130   match(Set dst (AddVS dst src));
6131   format %{ "paddw   $dst,$src\t! add packed2S" %}
6132   ins_encode %{
6133     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6134   %}
6135   ins_pipe( pipe_slow );
6136 %}
6137 
6138 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6139   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6140   match(Set dst (AddVS src1 src2));
6141   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6142   ins_encode %{
6143     int vector_len = 0;
6144     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6145   %}
6146   ins_pipe( pipe_slow );
6147 %}
6148 
6149 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6150   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6151   match(Set dst (AddVS src (LoadVector mem)));
6152   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6153   ins_encode %{
6154     int vector_len = 0;
6155     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6156   %}
6157   ins_pipe( pipe_slow );
6158 %}
6159 
6160 instruct vadd4S(vecD dst, vecD src) %{
6161   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6162   match(Set dst (AddVS dst src));
6163   format %{ "paddw   $dst,$src\t! add packed4S" %}
6164   ins_encode %{
6165     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6166   %}
6167   ins_pipe( pipe_slow );
6168 %}
6169 
6170 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6171   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6172   match(Set dst (AddVS src1 src2));
6173   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6174   ins_encode %{
6175     int vector_len = 0;
6176     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6177   %}
6178   ins_pipe( pipe_slow );
6179 %}
6180 
6181 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6182   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6183   match(Set dst (AddVS src (LoadVector mem)));
6184   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6185   ins_encode %{
6186     int vector_len = 0;
6187     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6188   %}
6189   ins_pipe( pipe_slow );
6190 %}
6191 
6192 instruct vadd8S(vecX dst, vecX src) %{
6193   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6194   match(Set dst (AddVS dst src));
6195   format %{ "paddw   $dst,$src\t! add packed8S" %}
6196   ins_encode %{
6197     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6198   %}
6199   ins_pipe( pipe_slow );
6200 %}
6201 
6202 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6203   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6204   match(Set dst (AddVS src1 src2));
6205   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6206   ins_encode %{
6207     int vector_len = 0;
6208     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6209   %}
6210   ins_pipe( pipe_slow );
6211 %}
6212 
6213 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6214   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6215   match(Set dst (AddVS src (LoadVector mem)));
6216   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6217   ins_encode %{
6218     int vector_len = 0;
6219     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6220   %}
6221   ins_pipe( pipe_slow );
6222 %}
6223 
6224 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6225   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6226   match(Set dst (AddVS src1 src2));
6227   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6228   ins_encode %{
6229     int vector_len = 1;
6230     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6231   %}
6232   ins_pipe( pipe_slow );
6233 %}
6234 
6235 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6236   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6237   match(Set dst (AddVS src (LoadVector mem)));
6238   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6239   ins_encode %{
6240     int vector_len = 1;
6241     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6242   %}
6243   ins_pipe( pipe_slow );
6244 %}
6245 
6246 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6247   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6248   match(Set dst (AddVS src1 src2));
6249   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6250   ins_encode %{
6251     int vector_len = 2;
6252     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6253   %}
6254   ins_pipe( pipe_slow );
6255 %}
6256 
6257 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6258   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6259   match(Set dst (AddVS src (LoadVector mem)));
6260   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6261   ins_encode %{
6262     int vector_len = 2;
6263     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6264   %}
6265   ins_pipe( pipe_slow );
6266 %}
6267 
6268 // Integers vector add
6269 instruct vadd2I(vecD dst, vecD src) %{
6270   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6271   match(Set dst (AddVI dst src));
6272   format %{ "paddd   $dst,$src\t! add packed2I" %}
6273   ins_encode %{
6274     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6275   %}
6276   ins_pipe( pipe_slow );
6277 %}
6278 
6279 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6280   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6281   match(Set dst (AddVI src1 src2));
6282   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6283   ins_encode %{
6284     int vector_len = 0;
6285     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6286   %}
6287   ins_pipe( pipe_slow );
6288 %}
6289 
6290 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6291   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6292   match(Set dst (AddVI src (LoadVector mem)));
6293   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6294   ins_encode %{
6295     int vector_len = 0;
6296     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6297   %}
6298   ins_pipe( pipe_slow );
6299 %}
6300 
6301 instruct vadd4I(vecX dst, vecX src) %{
6302   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6303   match(Set dst (AddVI dst src));
6304   format %{ "paddd   $dst,$src\t! add packed4I" %}
6305   ins_encode %{
6306     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6307   %}
6308   ins_pipe( pipe_slow );
6309 %}
6310 
6311 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6312   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6313   match(Set dst (AddVI src1 src2));
6314   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6315   ins_encode %{
6316     int vector_len = 0;
6317     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6318   %}
6319   ins_pipe( pipe_slow );
6320 %}
6321 
6322 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6323   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6324   match(Set dst (AddVI src (LoadVector mem)));
6325   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6326   ins_encode %{
6327     int vector_len = 0;
6328     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6329   %}
6330   ins_pipe( pipe_slow );
6331 %}
6332 
6333 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6334   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6335   match(Set dst (AddVI src1 src2));
6336   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6337   ins_encode %{
6338     int vector_len = 1;
6339     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6340   %}
6341   ins_pipe( pipe_slow );
6342 %}
6343 
6344 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6345   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6346   match(Set dst (AddVI src (LoadVector mem)));
6347   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6348   ins_encode %{
6349     int vector_len = 1;
6350     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6351   %}
6352   ins_pipe( pipe_slow );
6353 %}
6354 
6355 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6356   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6357   match(Set dst (AddVI src1 src2));
6358   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6359   ins_encode %{
6360     int vector_len = 2;
6361     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6362   %}
6363   ins_pipe( pipe_slow );
6364 %}
6365 
6366 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6367   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6368   match(Set dst (AddVI src (LoadVector mem)));
6369   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6370   ins_encode %{
6371     int vector_len = 2;
6372     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6373   %}
6374   ins_pipe( pipe_slow );
6375 %}
6376 
6377 // Longs vector add
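// Same shape for 64-bit lanes via paddq/vpaddq.  Illustrative intrinsic
// equivalent (sketch only, helper name invented):
//
//   #include <emmintrin.h>
//   __m128i add2l(__m128i a, __m128i b) { return _mm_add_epi64(a, b); }  // paddq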
6378 instruct vadd2L(vecX dst, vecX src) %{
6379   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6380   match(Set dst (AddVL dst src));
6381   format %{ "paddq   $dst,$src\t! add packed2L" %}
6382   ins_encode %{
6383     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6384   %}
6385   ins_pipe( pipe_slow );
6386 %}
6387 
6388 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6389   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6390   match(Set dst (AddVL src1 src2));
6391   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6392   ins_encode %{
6393     int vector_len = 0;
6394     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6395   %}
6396   ins_pipe( pipe_slow );
6397 %}
6398 
6399 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6400   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6401   match(Set dst (AddVL src (LoadVector mem)));
6402   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6403   ins_encode %{
6404     int vector_len = 0;
6405     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6406   %}
6407   ins_pipe( pipe_slow );
6408 %}
6409 
6410 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6411   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6412   match(Set dst (AddVL src1 src2));
6413   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6414   ins_encode %{
6415     int vector_len = 1;
6416     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6417   %}
6418   ins_pipe( pipe_slow );
6419 %}
6420 
6421 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6422   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6423   match(Set dst (AddVL src (LoadVector mem)));
6424   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6425   ins_encode %{
6426     int vector_len = 1;
6427     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6428   %}
6429   ins_pipe( pipe_slow );
6430 %}
6431 
6432 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6433   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6434   match(Set dst (AddVL src1 src2));
6435   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6436   ins_encode %{
6437     int vector_len = 2;
6438     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6439   %}
6440   ins_pipe( pipe_slow );
6441 %}
6442 
6443 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6444   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6445   match(Set dst (AddVL src (LoadVector mem)));
6446   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6447   ins_encode %{
6448     int vector_len = 2;
6449     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6450   %}
6451   ins_pipe( pipe_slow );
6452 %}
6453 
6454 // Floats vector add
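// Note that the 256-bit float forms below only require UseAVX > 0: addps has a
// VEX ymm encoding since the first AVX generation, unlike the 256-bit integer
// adds above, which need AVX2.  Intrinsic sketch of the operation (not HotSpot
// code):
//
//   #include <immintrin.h>
//   __m256 add8f(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }  // vaddps ymm (AVX)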
6455 instruct vadd2F(vecD dst, vecD src) %{
6456   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6457   match(Set dst (AddVF dst src));
6458   format %{ "addps   $dst,$src\t! add packed2F" %}
6459   ins_encode %{
6460     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6461   %}
6462   ins_pipe( pipe_slow );
6463 %}
6464 
6465 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6466   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6467   match(Set dst (AddVF src1 src2));
6468   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6469   ins_encode %{
6470     int vector_len = 0;
6471     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6472   %}
6473   ins_pipe( pipe_slow );
6474 %}
6475 
6476 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6477   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6478   match(Set dst (AddVF src (LoadVector mem)));
6479   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6480   ins_encode %{
6481     int vector_len = 0;
6482     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6483   %}
6484   ins_pipe( pipe_slow );
6485 %}
6486 
6487 instruct vadd4F(vecX dst, vecX src) %{
6488   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6489   match(Set dst (AddVF dst src));
6490   format %{ "addps   $dst,$src\t! add packed4F" %}
6491   ins_encode %{
6492     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6493   %}
6494   ins_pipe( pipe_slow );
6495 %}
6496 
6497 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6498   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6499   match(Set dst (AddVF src1 src2));
6500   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6501   ins_encode %{
6502     int vector_len = 0;
6503     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6504   %}
6505   ins_pipe( pipe_slow );
6506 %}
6507 
6508 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6509   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6510   match(Set dst (AddVF src (LoadVector mem)));
6511   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6512   ins_encode %{
6513     int vector_len = 0;
6514     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6515   %}
6516   ins_pipe( pipe_slow );
6517 %}
6518 
6519 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6520   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6521   match(Set dst (AddVF src1 src2));
6522   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6523   ins_encode %{
6524     int vector_len = 1;
6525     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6526   %}
6527   ins_pipe( pipe_slow );
6528 %}
6529 
6530 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6531   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6532   match(Set dst (AddVF src (LoadVector mem)));
6533   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6534   ins_encode %{
6535     int vector_len = 1;
6536     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6537   %}
6538   ins_pipe( pipe_slow );
6539 %}
6540 
6541 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6542   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6543   match(Set dst (AddVF src1 src2));
6544   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6545   ins_encode %{
6546     int vector_len = 2;
6547     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6548   %}
6549   ins_pipe( pipe_slow );
6550 %}
6551 
6552 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6553   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6554   match(Set dst (AddVF src (LoadVector mem)));
6555   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6556   ins_encode %{
6557     int vector_len = 2;
6558     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6559   %}
6560   ins_pipe( pipe_slow );
6561 %}
6562 
6563 // Doubles vector add
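// Double lanes follow the float pattern with addpd/vaddpd.  Sketch with
// intrinsics (illustration only, helper name invented):
//
//   #include <emmintrin.h>
//   __m128d add2d(__m128d a, __m128d b) { return _mm_add_pd(a, b); }  // addpd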
6564 instruct vadd2D(vecX dst, vecX src) %{
6565   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6566   match(Set dst (AddVD dst src));
6567   format %{ "addpd   $dst,$src\t! add packed2D" %}
6568   ins_encode %{
6569     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6570   %}
6571   ins_pipe( pipe_slow );
6572 %}
6573 
6574 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6575   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6576   match(Set dst (AddVD src1 src2));
6577   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6578   ins_encode %{
6579     int vector_len = 0;
6580     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6581   %}
6582   ins_pipe( pipe_slow );
6583 %}
6584 
6585 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6586   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6587   match(Set dst (AddVD src (LoadVector mem)));
6588   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6589   ins_encode %{
6590     int vector_len = 0;
6591     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6592   %}
6593   ins_pipe( pipe_slow );
6594 %}
6595 
6596 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6597   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6598   match(Set dst (AddVD src1 src2));
6599   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6600   ins_encode %{
6601     int vector_len = 1;
6602     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6603   %}
6604   ins_pipe( pipe_slow );
6605 %}
6606 
6607 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6608   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6609   match(Set dst (AddVD src (LoadVector mem)));
6610   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6611   ins_encode %{
6612     int vector_len = 1;
6613     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6614   %}
6615   ins_pipe( pipe_slow );
6616 %}
6617 
6618 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6619   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6620   match(Set dst (AddVD src1 src2));
6621   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6622   ins_encode %{
6623     int vector_len = 2;
6624     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6625   %}
6626   ins_pipe( pipe_slow );
6627 %}
6628 
6629 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6630   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6631   match(Set dst (AddVD src (LoadVector mem)));
6632   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6633   ins_encode %{
6634     int vector_len = 2;
6635     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6636   %}
6637   ins_pipe( pipe_slow );
6638 %}
6639 
6640 // --------------------------------- SUB --------------------------------------
6641 
6642 // Bytes vector sub
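// Byte subtract maps to psubb/vpsubb.  The 64-byte (zmm) forms additionally
// test supports_avx512bw() because byte and word operations on zmm registers
// need AVX-512BW, not just AVX-512F.  Intrinsic sketch (not HotSpot code):
//
//   #include <immintrin.h>
//   __m128i sub16b(__m128i a, __m128i b) { return _mm_sub_epi8(a, b);    }  // psubb
//   __m512i sub64b(__m512i a, __m512i b) { return _mm512_sub_epi8(a, b); }  // vpsubb zmm (AVX-512BW)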
6643 instruct vsub4B(vecS dst, vecS src) %{
6644   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6645   match(Set dst (SubVB dst src));
6646   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6647   ins_encode %{
6648     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6649   %}
6650   ins_pipe( pipe_slow );
6651 %}
6652 
6653 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6654   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6655   match(Set dst (SubVB src1 src2));
6656   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6657   ins_encode %{
6658     int vector_len = 0;
6659     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6660   %}
6661   ins_pipe( pipe_slow );
6662 %}
6663 
6664 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6665   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6666   match(Set dst (SubVB src (LoadVector mem)));
6667   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6668   ins_encode %{
6669     int vector_len = 0;
6670     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6671   %}
6672   ins_pipe( pipe_slow );
6673 %}
6674 
6675 instruct vsub8B(vecD dst, vecD src) %{
6676   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6677   match(Set dst (SubVB dst src));
6678   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6679   ins_encode %{
6680     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6681   %}
6682   ins_pipe( pipe_slow );
6683 %}
6684 
6685 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6686   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6687   match(Set dst (SubVB src1 src2));
6688   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6689   ins_encode %{
6690     int vector_len = 0;
6691     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6692   %}
6693   ins_pipe( pipe_slow );
6694 %}
6695 
6696 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6697   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6698   match(Set dst (SubVB src (LoadVector mem)));
6699   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6700   ins_encode %{
6701     int vector_len = 0;
6702     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6703   %}
6704   ins_pipe( pipe_slow );
6705 %}
6706 
6707 instruct vsub16B(vecX dst, vecX src) %{
6708   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6709   match(Set dst (SubVB dst src));
6710   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6711   ins_encode %{
6712     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6713   %}
6714   ins_pipe( pipe_slow );
6715 %}
6716 
6717 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6718   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6719   match(Set dst (SubVB src1 src2));
6720   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6721   ins_encode %{
6722     int vector_len = 0;
6723     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6724   %}
6725   ins_pipe( pipe_slow );
6726 %}
6727 
6728 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6729   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6730   match(Set dst (SubVB src (LoadVector mem)));
6731   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6732   ins_encode %{
6733     int vector_len = 0;
6734     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6735   %}
6736   ins_pipe( pipe_slow );
6737 %}
6738 
6739 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6740   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6741   match(Set dst (SubVB src1 src2));
6742   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6743   ins_encode %{
6744     int vector_len = 1;
6745     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6746   %}
6747   ins_pipe( pipe_slow );
6748 %}
6749 
6750 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6751   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6752   match(Set dst (SubVB src (LoadVector mem)));
6753   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6754   ins_encode %{
6755     int vector_len = 1;
6756     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6757   %}
6758   ins_pipe( pipe_slow );
6759 %}
6760 
6761 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6762   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6763   match(Set dst (SubVB src1 src2));
6764   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6765   ins_encode %{
6766     int vector_len = 2;
6767     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6768   %}
6769   ins_pipe( pipe_slow );
6770 %}
6771 
6772 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6773   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6774   match(Set dst (SubVB src (LoadVector mem)));
6775   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6776   ins_encode %{
6777     int vector_len = 2;
6778     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6779   %}
6780   ins_pipe( pipe_slow );
6781 %}
6782 
6783 // Shorts/Chars vector sub
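// Java short and char both occupy 16-bit lanes, and two's-complement subtract
// is identical for signed and unsigned data, so a single psubw/vpsubw rule
// covers both element types.  Intrinsic sketch (illustration only):
//
//   #include <emmintrin.h>
//   __m128i sub8s(__m128i a, __m128i b) { return _mm_sub_epi16(a, b); }  // psubw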
6784 instruct vsub2S(vecS dst, vecS src) %{
6785   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6786   match(Set dst (SubVS dst src));
6787   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6788   ins_encode %{
6789     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6790   %}
6791   ins_pipe( pipe_slow );
6792 %}
6793 
6794 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6795   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6796   match(Set dst (SubVS src1 src2));
6797   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6798   ins_encode %{
6799     int vector_len = 0;
6800     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6801   %}
6802   ins_pipe( pipe_slow );
6803 %}
6804 
6805 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6806   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6807   match(Set dst (SubVS src (LoadVector mem)));
6808   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6809   ins_encode %{
6810     int vector_len = 0;
6811     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6812   %}
6813   ins_pipe( pipe_slow );
6814 %}
6815 
6816 instruct vsub4S(vecD dst, vecD src) %{
6817   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6818   match(Set dst (SubVS dst src));
6819   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6820   ins_encode %{
6821     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6822   %}
6823   ins_pipe( pipe_slow );
6824 %}
6825 
6826 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6827   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6828   match(Set dst (SubVS src1 src2));
6829   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6830   ins_encode %{
6831     int vector_len = 0;
6832     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6833   %}
6834   ins_pipe( pipe_slow );
6835 %}
6836 
6837 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6838   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6839   match(Set dst (SubVS src (LoadVector mem)));
6840   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6841   ins_encode %{
6842     int vector_len = 0;
6843     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6844   %}
6845   ins_pipe( pipe_slow );
6846 %}
6847 
6848 instruct vsub8S(vecX dst, vecX src) %{
6849   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6850   match(Set dst (SubVS dst src));
6851   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6852   ins_encode %{
6853     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6854   %}
6855   ins_pipe( pipe_slow );
6856 %}
6857 
6858 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6859   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6860   match(Set dst (SubVS src1 src2));
6861   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6862   ins_encode %{
6863     int vector_len = 0;
6864     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6865   %}
6866   ins_pipe( pipe_slow );
6867 %}
6868 
6869 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6870   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6871   match(Set dst (SubVS src (LoadVector mem)));
6872   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6873   ins_encode %{
6874     int vector_len = 0;
6875     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6876   %}
6877   ins_pipe( pipe_slow );
6878 %}
6879 
6880 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6881   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6882   match(Set dst (SubVS src1 src2));
6883   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6884   ins_encode %{
6885     int vector_len = 1;
6886     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6887   %}
6888   ins_pipe( pipe_slow );
6889 %}
6890 
6891 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6892   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6893   match(Set dst (SubVS src (LoadVector mem)));
6894   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6895   ins_encode %{
6896     int vector_len = 1;
6897     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6898   %}
6899   ins_pipe( pipe_slow );
6900 %}
6901 
6902 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6903   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6904   match(Set dst (SubVS src1 src2));
6905   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6906   ins_encode %{
6907     int vector_len = 2;
6908     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6909   %}
6910   ins_pipe( pipe_slow );
6911 %}
6912 
6913 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6914   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6915   match(Set dst (SubVS src (LoadVector mem)));
6916   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6917   ins_encode %{
6918     int vector_len = 2;
6919     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6920   %}
6921   ins_pipe( pipe_slow );
6922 %}
6923 
6924 // Integers vector sub
6925 instruct vsub2I(vecD dst, vecD src) %{
6926   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6927   match(Set dst (SubVI dst src));
6928   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6929   ins_encode %{
6930     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6931   %}
6932   ins_pipe( pipe_slow );
6933 %}
6934 
6935 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
6936   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6937   match(Set dst (SubVI src1 src2));
6938   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
6939   ins_encode %{
6940     int vector_len = 0;
6941     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6942   %}
6943   ins_pipe( pipe_slow );
6944 %}
6945 
6946 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
6947   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6948   match(Set dst (SubVI src (LoadVector mem)));
6949   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
6950   ins_encode %{
6951     int vector_len = 0;
6952     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6953   %}
6954   ins_pipe( pipe_slow );
6955 %}
6956 
6957 instruct vsub4I(vecX dst, vecX src) %{
6958   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6959   match(Set dst (SubVI dst src));
6960   format %{ "psubd   $dst,$src\t! sub packed4I" %}
6961   ins_encode %{
6962     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6963   %}
6964   ins_pipe( pipe_slow );
6965 %}
6966 
6967 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
6968   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6969   match(Set dst (SubVI src1 src2));
6970   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
6971   ins_encode %{
6972     int vector_len = 0;
6973     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6974   %}
6975   ins_pipe( pipe_slow );
6976 %}
6977 
6978 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
6979   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6980   match(Set dst (SubVI src (LoadVector mem)));
6981   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
6982   ins_encode %{
6983     int vector_len = 0;
6984     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6985   %}
6986   ins_pipe( pipe_slow );
6987 %}
6988 
6989 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
6990   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6991   match(Set dst (SubVI src1 src2));
6992   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
6993   ins_encode %{
6994     int vector_len = 1;
6995     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6996   %}
6997   ins_pipe( pipe_slow );
6998 %}
6999 
7000 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7001   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7002   match(Set dst (SubVI src (LoadVector mem)));
7003   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7004   ins_encode %{
7005     int vector_len = 1;
7006     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7007   %}
7008   ins_pipe( pipe_slow );
7009 %}
7010 
7011 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7012   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7013   match(Set dst (SubVI src1 src2));
7014   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7015   ins_encode %{
7016     int vector_len = 2;
7017     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7018   %}
7019   ins_pipe( pipe_slow );
7020 %}
7021 
7022 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7023   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7024   match(Set dst (SubVI src (LoadVector mem)));
7025   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7026   ins_encode %{
7027     int vector_len = 2;
7028     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7029   %}
7030   ins_pipe( pipe_slow );
7031 %}
7032 
7033 // Longs vector sub
7034 instruct vsub2L(vecX dst, vecX src) %{
7035   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7036   match(Set dst (SubVL dst src));
7037   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7038   ins_encode %{
7039     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7040   %}
7041   ins_pipe( pipe_slow );
7042 %}
7043 
7044 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7045   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7046   match(Set dst (SubVL src1 src2));
7047   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7048   ins_encode %{
7049     int vector_len = 0;
7050     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7051   %}
7052   ins_pipe( pipe_slow );
7053 %}
7054 
7055 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7056   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7057   match(Set dst (SubVL src (LoadVector mem)));
7058   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7059   ins_encode %{
7060     int vector_len = 0;
7061     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7062   %}
7063   ins_pipe( pipe_slow );
7064 %}
7065 
7066 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7067   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7068   match(Set dst (SubVL src1 src2));
7069   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7070   ins_encode %{
7071     int vector_len = 1;
7072     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7073   %}
7074   ins_pipe( pipe_slow );
7075 %}
7076 
7077 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7078   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7079   match(Set dst (SubVL src (LoadVector mem)));
7080   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7081   ins_encode %{
7082     int vector_len = 1;
7083     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7084   %}
7085   ins_pipe( pipe_slow );
7086 %}
7087 
7088 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7089   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7090   match(Set dst (SubVL src1 src2));
7091   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7092   ins_encode %{
7093     int vector_len = 2;
7094     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7095   %}
7096   ins_pipe( pipe_slow );
7097 %}
7098 
7099 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7100   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7101   match(Set dst (SubVL src (LoadVector mem)));
7102   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7103   ins_encode %{
7104     int vector_len = 2;
7105     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7106   %}
7107   ins_pipe( pipe_slow );
7108 %}
7109 
7110 // Floats vector sub
7111 instruct vsub2F(vecD dst, vecD src) %{
7112   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7113   match(Set dst (SubVF dst src));
7114   format %{ "subps   $dst,$src\t! sub packed2F" %}
7115   ins_encode %{
7116     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7117   %}
7118   ins_pipe( pipe_slow );
7119 %}
7120 
7121 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7122   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7123   match(Set dst (SubVF src1 src2));
7124   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7125   ins_encode %{
7126     int vector_len = 0;
7127     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7128   %}
7129   ins_pipe( pipe_slow );
7130 %}
7131 
7132 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7133   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7134   match(Set dst (SubVF src (LoadVector mem)));
7135   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7136   ins_encode %{
7137     int vector_len = 0;
7138     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7139   %}
7140   ins_pipe( pipe_slow );
7141 %}
7142 
7143 instruct vsub4F(vecX dst, vecX src) %{
7144   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7145   match(Set dst (SubVF dst src));
7146   format %{ "subps   $dst,$src\t! sub packed4F" %}
7147   ins_encode %{
7148     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7149   %}
7150   ins_pipe( pipe_slow );
7151 %}
7152 
7153 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7154   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7155   match(Set dst (SubVF src1 src2));
7156   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7157   ins_encode %{
7158     int vector_len = 0;
7159     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7160   %}
7161   ins_pipe( pipe_slow );
7162 %}
7163 
7164 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7165   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7166   match(Set dst (SubVF src (LoadVector mem)));
7167   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7168   ins_encode %{
7169     int vector_len = 0;
7170     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7171   %}
7172   ins_pipe( pipe_slow );
7173 %}
7174 
7175 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7176   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7177   match(Set dst (SubVF src1 src2));
7178   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7179   ins_encode %{
7180     int vector_len = 1;
7181     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7182   %}
7183   ins_pipe( pipe_slow );
7184 %}
7185 
7186 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7187   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7188   match(Set dst (SubVF src (LoadVector mem)));
7189   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7190   ins_encode %{
7191     int vector_len = 1;
7192     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7193   %}
7194   ins_pipe( pipe_slow );
7195 %}
7196 
7197 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7198   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7199   match(Set dst (SubVF src1 src2));
7200   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7201   ins_encode %{
7202     int vector_len = 2;
7203     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7204   %}
7205   ins_pipe( pipe_slow );
7206 %}
7207 
7208 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7209   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7210   match(Set dst (SubVF src (LoadVector mem)));
7211   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7212   ins_encode %{
7213     int vector_len = 2;
7214     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7215   %}
7216   ins_pipe( pipe_slow );
7217 %}
7218 
7219 // Doubles vector sub
7220 instruct vsub2D(vecX dst, vecX src) %{
7221   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7222   match(Set dst (SubVD dst src));
7223   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7224   ins_encode %{
7225     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7226   %}
7227   ins_pipe( pipe_slow );
7228 %}
7229 
7230 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7231   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7232   match(Set dst (SubVD src1 src2));
7233   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7234   ins_encode %{
7235     int vector_len = 0;
7236     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7237   %}
7238   ins_pipe( pipe_slow );
7239 %}
7240 
7241 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7242   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7243   match(Set dst (SubVD src (LoadVector mem)));
7244   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7245   ins_encode %{
7246     int vector_len = 0;
7247     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7248   %}
7249   ins_pipe( pipe_slow );
7250 %}
7251 
7252 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7253   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7254   match(Set dst (SubVD src1 src2));
7255   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7256   ins_encode %{
7257     int vector_len = 1;
7258     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7259   %}
7260   ins_pipe( pipe_slow );
7261 %}
7262 
7263 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7264   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7265   match(Set dst (SubVD src (LoadVector mem)));
7266   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7267   ins_encode %{
7268     int vector_len = 1;
7269     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7270   %}
7271   ins_pipe( pipe_slow );
7272 %}
7273 
7274 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7275   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7276   match(Set dst (SubVD src1 src2));
7277   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7278   ins_encode %{
7279     int vector_len = 2;
7280     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7281   %}
7282   ins_pipe( pipe_slow );
7283 %}
7284 
7285 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7286   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7287   match(Set dst (SubVD src (LoadVector mem)));
7288   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7289   ins_encode %{
7290     int vector_len = 2;
7291     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7292   %}
7293   ins_pipe( pipe_slow );
7294 %}
7295 
7296 // --------------------------------- MUL --------------------------------------
7297 
7298 // Shorts/Chars vector mul
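// pmullw/vpmullw keep only the low 16 bits of each product, which is the value
// a Java short multiply yields once the int result is narrowed back to 16 bits,
// so MulVS maps directly onto it.  Intrinsic sketch (not HotSpot code):
//
//   #include <emmintrin.h>
//   __m128i mul8s(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); }  // pmullw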
7299 instruct vmul2S(vecS dst, vecS src) %{
7300   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7301   match(Set dst (MulVS dst src));
7302   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
7303   ins_encode %{
7304     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7305   %}
7306   ins_pipe( pipe_slow );
7307 %}
7308 
7309 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7310   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7311   match(Set dst (MulVS src1 src2));
7312   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7313   ins_encode %{
7314     int vector_len = 0;
7315     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7316   %}
7317   ins_pipe( pipe_slow );
7318 %}
7319 
7320 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7321   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7322   match(Set dst (MulVS src (LoadVector mem)));
7323   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7324   ins_encode %{
7325     int vector_len = 0;
7326     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7327   %}
7328   ins_pipe( pipe_slow );
7329 %}
7330 
7331 instruct vmul4S(vecD dst, vecD src) %{
7332   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7333   match(Set dst (MulVS dst src));
7334   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7335   ins_encode %{
7336     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7337   %}
7338   ins_pipe( pipe_slow );
7339 %}
7340 
7341 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7342   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7343   match(Set dst (MulVS src1 src2));
7344   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7345   ins_encode %{
7346     int vector_len = 0;
7347     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7348   %}
7349   ins_pipe( pipe_slow );
7350 %}
7351 
7352 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7353   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7354   match(Set dst (MulVS src (LoadVector mem)));
7355   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7356   ins_encode %{
7357     int vector_len = 0;
7358     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7359   %}
7360   ins_pipe( pipe_slow );
7361 %}
7362 
7363 instruct vmul8S(vecX dst, vecX src) %{
7364   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7365   match(Set dst (MulVS dst src));
7366   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7367   ins_encode %{
7368     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7369   %}
7370   ins_pipe( pipe_slow );
7371 %}
7372 
7373 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7374   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7375   match(Set dst (MulVS src1 src2));
7376   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7377   ins_encode %{
7378     int vector_len = 0;
7379     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7380   %}
7381   ins_pipe( pipe_slow );
7382 %}
7383 
7384 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7385   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7386   match(Set dst (MulVS src (LoadVector mem)));
7387   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7388   ins_encode %{
7389     int vector_len = 0;
7390     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7391   %}
7392   ins_pipe( pipe_slow );
7393 %}
7394 
7395 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7396   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7397   match(Set dst (MulVS src1 src2));
7398   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7399   ins_encode %{
7400     int vector_len = 1;
7401     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7402   %}
7403   ins_pipe( pipe_slow );
7404 %}
7405 
7406 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7407   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7408   match(Set dst (MulVS src (LoadVector mem)));
7409   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7410   ins_encode %{
7411     int vector_len = 1;
7412     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7413   %}
7414   ins_pipe( pipe_slow );
7415 %}
7416 
7417 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7418   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7419   match(Set dst (MulVS src1 src2));
7420   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7421   ins_encode %{
7422     int vector_len = 2;
7423     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7424   %}
7425   ins_pipe( pipe_slow );
7426 %}
7427 
7428 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7429   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7430   match(Set dst (MulVS src (LoadVector mem)));
7431   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7432   ins_encode %{
7433     int vector_len = 2;
7434     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7435   %}
7436   ins_pipe( pipe_slow );
7437 %}
7438 
7439 // Integers vector mul (sse4_1)
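// A packed 32-bit low multiply (pmulld) only appeared with SSE4.1, which is why
// the non-AVX rules below test UseSSE > 3 rather than the usual UseAVX == 0.
// Intrinsic sketch (illustration only, helper name invented):
//
//   #include <smmintrin.h>  // SSE4.1
//   __m128i mul4i(__m128i a, __m128i b) { return _mm_mullo_epi32(a, b); }  // pmulld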
7440 instruct vmul2I(vecD dst, vecD src) %{
7441   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7442   match(Set dst (MulVI dst src));
7443   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7444   ins_encode %{
7445     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7446   %}
7447   ins_pipe( pipe_slow );
7448 %}
7449 
7450 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7451   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7452   match(Set dst (MulVI src1 src2));
7453   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7454   ins_encode %{
7455     int vector_len = 0;
7456     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7457   %}
7458   ins_pipe( pipe_slow );
7459 %}
7460 
7461 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7462   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7463   match(Set dst (MulVI src (LoadVector mem)));
7464   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7465   ins_encode %{
7466     int vector_len = 0;
7467     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7468   %}
7469   ins_pipe( pipe_slow );
7470 %}
7471 
7472 instruct vmul4I(vecX dst, vecX src) %{
7473   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7474   match(Set dst (MulVI dst src));
7475   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7476   ins_encode %{
7477     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7478   %}
7479   ins_pipe( pipe_slow );
7480 %}
7481 
7482 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7483   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7484   match(Set dst (MulVI src1 src2));
7485   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7486   ins_encode %{
7487     int vector_len = 0;
7488     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7489   %}
7490   ins_pipe( pipe_slow );
7491 %}
7492 
7493 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7494   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7495   match(Set dst (MulVI src (LoadVector mem)));
7496   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7497   ins_encode %{
7498     int vector_len = 0;
7499     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7500   %}
7501   ins_pipe( pipe_slow );
7502 %}
7503 
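// Longs vector mul: there is no single packed 64x64->64 multiply instruction
// before AVX-512, so all MulVL rules below require supports_avx512dq(), where
// vpmullq was introduced.  Intrinsic sketch (not HotSpot code):
//
//   #include <immintrin.h>
//   __m512i mul8l(__m512i a, __m512i b) { return _mm512_mullo_epi64(a, b); }  // vpmullq zmm (AVX-512DQ)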
7504 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7505   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7506   match(Set dst (MulVL src1 src2));
7507   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7508   ins_encode %{
7509     int vector_len = 0;
7510     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7511   %}
7512   ins_pipe( pipe_slow );
7513 %}
7514 
7515 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7516   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7517   match(Set dst (MulVL src (LoadVector mem)));
7518   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7519   ins_encode %{
7520     int vector_len = 0;
7521     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7522   %}
7523   ins_pipe( pipe_slow );
7524 %}
7525 
7526 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7527   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7528   match(Set dst (MulVL src1 src2));
7529   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7530   ins_encode %{
7531     int vector_len = 1;
7532     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7533   %}
7534   ins_pipe( pipe_slow );
7535 %}
7536 
7537 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7538   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7539   match(Set dst (MulVL src (LoadVector mem)));
7540   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7541   ins_encode %{
7542     int vector_len = 1;
7543     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7544   %}
7545   ins_pipe( pipe_slow );
7546 %}
7547 
7548 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7549   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7550   match(Set dst (MulVL src1 src2));
7551   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7552   ins_encode %{
7553     int vector_len = 2;
7554     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7555   %}
7556   ins_pipe( pipe_slow );
7557 %}
7558 
7559 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7560   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7561   match(Set dst (MulVL src (LoadVector mem)));
7562   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7563   ins_encode %{
7564     int vector_len = 2;
7565     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7571   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7572   match(Set dst (MulVI src1 src2));
7573   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7574   ins_encode %{
7575     int vector_len = 1;
7576     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7577   %}
7578   ins_pipe( pipe_slow );
7579 %}
7580 
7581 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7582   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7583   match(Set dst (MulVI src (LoadVector mem)));
7584   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7585   ins_encode %{
7586     int vector_len = 1;
7587     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7588   %}
7589   ins_pipe( pipe_slow );
7590 %}
7591 
7592 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7593   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7594   match(Set dst (MulVI src1 src2));
7595   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7596   ins_encode %{
7597     int vector_len = 2;
7598     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7599   %}
7600   ins_pipe( pipe_slow );
7601 %}
7602 
7603 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7604   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7605   match(Set dst (MulVI src (LoadVector mem)));
7606   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7607   ins_encode %{
7608     int vector_len = 2;
7609     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7610   %}
7611   ins_pipe( pipe_slow );
7612 %}
7613 
7614 // Floats vector mul
7615 instruct vmul2F(vecD dst, vecD src) %{
7616   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7617   match(Set dst (MulVF dst src));
7618   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7619   ins_encode %{
7620     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7621   %}
7622   ins_pipe( pipe_slow );
7623 %}
7624 
7625 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7626   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7627   match(Set dst (MulVF src1 src2));
7628   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7629   ins_encode %{
7630     int vector_len = 0;
7631     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7632   %}
7633   ins_pipe( pipe_slow );
7634 %}
7635 
7636 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7637   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7638   match(Set dst (MulVF src (LoadVector mem)));
7639   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7640   ins_encode %{
7641     int vector_len = 0;
7642     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7643   %}
7644   ins_pipe( pipe_slow );
7645 %}
7646 
7647 instruct vmul4F(vecX dst, vecX src) %{
7648   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7649   match(Set dst (MulVF dst src));
7650   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7651   ins_encode %{
7652     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7653   %}
7654   ins_pipe( pipe_slow );
7655 %}
7656 
7657 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7658   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7659   match(Set dst (MulVF src1 src2));
7660   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7661   ins_encode %{
7662     int vector_len = 0;
7663     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7664   %}
7665   ins_pipe( pipe_slow );
7666 %}
7667 
7668 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7669   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7670   match(Set dst (MulVF src (LoadVector mem)));
7671   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7672   ins_encode %{
7673     int vector_len = 0;
7674     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7675   %}
7676   ins_pipe( pipe_slow );
7677 %}
7678 
7679 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7680   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7681   match(Set dst (MulVF src1 src2));
7682   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7683   ins_encode %{
7684     int vector_len = 1;
7685     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7686   %}
7687   ins_pipe( pipe_slow );
7688 %}
7689 
7690 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7691   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7692   match(Set dst (MulVF src (LoadVector mem)));
7693   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7694   ins_encode %{
7695     int vector_len = 1;
7696     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7697   %}
7698   ins_pipe( pipe_slow );
7699 %}
7700 
7701 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7702   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7703   match(Set dst (MulVF src1 src2));
7704   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7705   ins_encode %{
7706     int vector_len = 2;
7707     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7708   %}
7709   ins_pipe( pipe_slow );
7710 %}
7711 
7712 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7713   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7714   match(Set dst (MulVF src (LoadVector mem)));
7715   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7716   ins_encode %{
7717     int vector_len = 2;
7718     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7719   %}
7720   ins_pipe( pipe_slow );
7721 %}
7722 
7723 // Doubles vector mul
7724 instruct vmul2D(vecX dst, vecX src) %{
7725   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7726   match(Set dst (MulVD dst src));
7727   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7728   ins_encode %{
7729     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7730   %}
7731   ins_pipe( pipe_slow );
7732 %}
7733 
7734 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7735   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7736   match(Set dst (MulVD src1 src2));
7737   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7738   ins_encode %{
7739     int vector_len = 0;
7740     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7741   %}
7742   ins_pipe( pipe_slow );
7743 %}
7744 
7745 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7746   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7747   match(Set dst (MulVD src (LoadVector mem)));
7748   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7749   ins_encode %{
7750     int vector_len = 0;
7751     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7752   %}
7753   ins_pipe( pipe_slow );
7754 %}
7755 
7756 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7757   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7758   match(Set dst (MulVD src1 src2));
7759   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7760   ins_encode %{
7761     int vector_len = 1;
7762     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7763   %}
7764   ins_pipe( pipe_slow );
7765 %}
7766 
7767 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7768   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7769   match(Set dst (MulVD src (LoadVector mem)));
7770   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7771   ins_encode %{
7772     int vector_len = 1;
7773     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7774   %}
7775   ins_pipe( pipe_slow );
7776 %}
7777 
7778 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7779   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7780   match(Set dst (MulVD src1 src2));
7781   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
7782   ins_encode %{
7783     int vector_len = 2;
7784     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7785   %}
7786   ins_pipe( pipe_slow );
7787 %}
7788 
7789 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7790   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7791   match(Set dst (MulVD src (LoadVector mem)));
7792   format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
7793   ins_encode %{
7794     int vector_len = 2;
7795     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7796   %}
7797   ins_pipe( pipe_slow );
7798 %}
7799 
7800 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7801   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7802   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7803   effect(TEMP dst, USE src1, USE src2);
7804   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7805             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7806          %}
7807   ins_encode %{
7808     int vector_len = 1;
7809     int cond = (Assembler::Condition)($copnd$$cmpcode);
7810     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7811     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7812   %}
7813   ins_pipe( pipe_slow );
7814 %}
7815 
7816 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7817   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7818   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7819   effect(TEMP dst, USE src1, USE src2);
7820   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7821             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7822          %}
7823   ins_encode %{
7824     int vector_len = 1;
7825     int cond = (Assembler::Condition)($copnd$$cmpcode);
7826     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7827     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7828   %}
7829   ins_pipe( pipe_slow );
7830 %}
7831 
7832 // --------------------------------- DIV --------------------------------------
7833 
7834 // Floats vector div
7835 instruct vdiv2F(vecD dst, vecD src) %{
7836   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7837   match(Set dst (DivVF dst src));
7838   format %{ "divps   $dst,$src\t! div packed2F" %}
7839   ins_encode %{
7840     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7841   %}
7842   ins_pipe( pipe_slow );
7843 %}
7844 
7845 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7846   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7847   match(Set dst (DivVF src1 src2));
7848   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
7849   ins_encode %{
7850     int vector_len = 0;
7851     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7852   %}
7853   ins_pipe( pipe_slow );
7854 %}
7855 
7856 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
7857   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7858   match(Set dst (DivVF src (LoadVector mem)));
7859   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
7860   ins_encode %{
7861     int vector_len = 0;
7862     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7863   %}
7864   ins_pipe( pipe_slow );
7865 %}
7866 
7867 instruct vdiv4F(vecX dst, vecX src) %{
7868   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7869   match(Set dst (DivVF dst src));
7870   format %{ "divps   $dst,$src\t! div packed4F" %}
7871   ins_encode %{
7872     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7873   %}
7874   ins_pipe( pipe_slow );
7875 %}
7876 
7877 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
7878   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7879   match(Set dst (DivVF src1 src2));
7880   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
7881   ins_encode %{
7882     int vector_len = 0;
7883     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7884   %}
7885   ins_pipe( pipe_slow );
7886 %}
7887 
7888 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
7889   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7890   match(Set dst (DivVF src (LoadVector mem)));
7891   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
7892   ins_encode %{
7893     int vector_len = 0;
7894     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7895   %}
7896   ins_pipe( pipe_slow );
7897 %}
7898 
7899 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
7900   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7901   match(Set dst (DivVF src1 src2));
7902   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
7903   ins_encode %{
7904     int vector_len = 1;
7905     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7906   %}
7907   ins_pipe( pipe_slow );
7908 %}
7909 
7910 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
7911   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7912   match(Set dst (DivVF src (LoadVector mem)));
7913   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
7914   ins_encode %{
7915     int vector_len = 1;
7916     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7917   %}
7918   ins_pipe( pipe_slow );
7919 %}
7920 
7921 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7922   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7923   match(Set dst (DivVF src1 src2));
7924   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
7925   ins_encode %{
7926     int vector_len = 2;
7927     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7928   %}
7929   ins_pipe( pipe_slow );
7930 %}
7931 
7932 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
7933   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7934   match(Set dst (DivVF src (LoadVector mem)));
7935   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
7936   ins_encode %{
7937     int vector_len = 2;
7938     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7939   %}
7940   ins_pipe( pipe_slow );
7941 %}
7942 
7943 // Doubles vector div
7944 instruct vdiv2D(vecX dst, vecX src) %{
7945   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7946   match(Set dst (DivVD dst src));
7947   format %{ "divpd   $dst,$src\t! div packed2D" %}
7948   ins_encode %{
7949     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
7950   %}
7951   ins_pipe( pipe_slow );
7952 %}
7953 
7954 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
7955   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7956   match(Set dst (DivVD src1 src2));
7957   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
7958   ins_encode %{
7959     int vector_len = 0;
7960     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7961   %}
7962   ins_pipe( pipe_slow );
7963 %}
7964 
7965 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
7966   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7967   match(Set dst (DivVD src (LoadVector mem)));
7968   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
7969   ins_encode %{
7970     int vector_len = 0;
7971     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7972   %}
7973   ins_pipe( pipe_slow );
7974 %}
7975 
7976 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
7977   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7978   match(Set dst (DivVD src1 src2));
7979   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
7980   ins_encode %{
7981     int vector_len = 1;
7982     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7983   %}
7984   ins_pipe( pipe_slow );
7985 %}
7986 
7987 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
7988   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7989   match(Set dst (DivVD src (LoadVector mem)));
7990   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
7991   ins_encode %{
7992     int vector_len = 1;
7993     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7994   %}
7995   ins_pipe( pipe_slow );
7996 %}
7997 
7998 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7999   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8000   match(Set dst (DivVD src1 src2));
8001   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8002   ins_encode %{
8003     int vector_len = 2;
8004     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8005   %}
8006   ins_pipe( pipe_slow );
8007 %}
8008 
8009 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8010   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8011   match(Set dst (DivVD src (LoadVector mem)));
8012   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8013   ins_encode %{
8014     int vector_len = 2;
8015     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8016   %}
8017   ins_pipe( pipe_slow );
8018 %}
8019 
8020 // ------------------------------ Shift ---------------------------------------
8021 
8022 // Left and right shift count vectors are the same on x86
8023 // (only lowest bits of xmm reg are used for count).
8024 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8025   match(Set dst (LShiftCntV cnt));
8026   match(Set dst (RShiftCntV cnt));
8027   format %{ "movd    $dst,$cnt\t! load shift count" %}
8028   ins_encode %{
8029     __ movdl($dst$$XMMRegister, $cnt$$Register);
8030   %}
8031   ins_pipe( pipe_slow );
8032 %}
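
     // As an illustration of the shared shift count noted above, a minimal
     // user-level sketch using SSE2 intrinsics (an assumed example with an
     // int 'shift' and an __m128i 'v'; it is not part of this file): the one
     // count register loaded by movd feeds left, logical-right and
     // arithmetic-right 16-bit shifts alike.
     //
     //   #include <emmintrin.h>
     //   __m128i cnt = _mm_cvtsi32_si128(shift); // movd: count in the low bits
     //   __m128i sl  = _mm_sll_epi16(v, cnt);    // psllw reads the same count
     //   __m128i lr  = _mm_srl_epi16(v, cnt);    // psrlw reads the same count
     //   __m128i ar  = _mm_sra_epi16(v, cnt);    // psraw reads the same count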
8033 
8034 // --------------------------------- Sqrt --------------------------------------
8035 
8036 // Floating point vector sqrt
8037 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8038   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8039   match(Set dst (SqrtVD src));
8040   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8041   ins_encode %{
8042     int vector_len = 0;
8043     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8044   %}
8045   ins_pipe( pipe_slow );
8046 %}
8047 
8048 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8049   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8050   match(Set dst (SqrtVD (LoadVector mem)));
8051   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8052   ins_encode %{
8053     int vector_len = 0;
8054     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8055   %}
8056   ins_pipe( pipe_slow );
8057 %}
8058 
8059 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8060   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8061   match(Set dst (SqrtVD src));
8062   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8063   ins_encode %{
8064     int vector_len = 1;
8065     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8066   %}
8067   ins_pipe( pipe_slow );
8068 %}
8069 
8070 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8071   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8072   match(Set dst (SqrtVD (LoadVector mem)));
8073   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8074   ins_encode %{
8075     int vector_len = 1;
8076     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8077   %}
8078   ins_pipe( pipe_slow );
8079 %}
8080 
8081 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8082   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8083   match(Set dst (SqrtVD src));
8084   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8085   ins_encode %{
8086     int vector_len = 2;
8087     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8088   %}
8089   ins_pipe( pipe_slow );
8090 %}
8091 
8092 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8093   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8094   match(Set dst (SqrtVD (LoadVector mem)));
8095   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8096   ins_encode %{
8097     int vector_len = 2;
8098     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8099   %}
8100   ins_pipe( pipe_slow );
8101 %}
8102 
8103 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8104   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8105   match(Set dst (SqrtVF src));
8106   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8107   ins_encode %{
8108     int vector_len = 0;
8109     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8110   %}
8111   ins_pipe( pipe_slow );
8112 %}
8113 
8114 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8115   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8116   match(Set dst (SqrtVF (LoadVector mem)));
8117   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8118   ins_encode %{
8119     int vector_len = 0;
8120     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8121   %}
8122   ins_pipe( pipe_slow );
8123 %}
8124 
8125 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8126   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8127   match(Set dst (SqrtVF src));
8128   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8129   ins_encode %{
8130     int vector_len = 0;
8131     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8132   %}
8133   ins_pipe( pipe_slow );
8134 %}
8135 
8136 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8137   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8138   match(Set dst (SqrtVF (LoadVector mem)));
8139   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8140   ins_encode %{
8141     int vector_len = 0;
8142     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8143   %}
8144   ins_pipe( pipe_slow );
8145 %}
8146 
8147 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8148   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8149   match(Set dst (SqrtVF src));
8150   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8151   ins_encode %{
8152     int vector_len = 1;
8153     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8154   %}
8155   ins_pipe( pipe_slow );
8156 %}
8157 
8158 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8159   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8160   match(Set dst (SqrtVF (LoadVector mem)));
8161   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8162   ins_encode %{
8163     int vector_len = 1;
8164     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8165   %}
8166   ins_pipe( pipe_slow );
8167 %}
8168 
8169 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8170   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8171   match(Set dst (SqrtVF src));
8172   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8173   ins_encode %{
8174     int vector_len = 2;
8175     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8176   %}
8177   ins_pipe( pipe_slow );
8178 %}
8179 
8180 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8181   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8182   match(Set dst (SqrtVF (LoadVector mem)));
8183   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8184   ins_encode %{
8185     int vector_len = 2;
8186     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8187   %}
8188   ins_pipe( pipe_slow );
8189 %}
8190 
8191 // ------------------------------ LeftShift -----------------------------------
8192 
8193 // Shorts/Chars vector left shift
8194 instruct vsll2S(vecS dst, vecS shift) %{
8195   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8196   match(Set dst (LShiftVS dst shift));
8197   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8198   ins_encode %{
8199     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8200   %}
8201   ins_pipe( pipe_slow );
8202 %}
8203 
8204 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8205   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8206   match(Set dst (LShiftVS dst shift));
8207   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8208   ins_encode %{
8209     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8210   %}
8211   ins_pipe( pipe_slow );
8212 %}
8213 
8214 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
8215   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8216   match(Set dst (LShiftVS src shift));
8217   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8218   ins_encode %{
8219     int vector_len = 0;
8220     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8221   %}
8222   ins_pipe( pipe_slow );
8223 %}
8224 
8225 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8226   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8227   match(Set dst (LShiftVS src shift));
8228   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8229   ins_encode %{
8230     int vector_len = 0;
8231     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8232   %}
8233   ins_pipe( pipe_slow );
8234 %}
8235 
8236 instruct vsll4S(vecD dst, vecS shift) %{
8237   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8238   match(Set dst (LShiftVS dst shift));
8239   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8240   ins_encode %{
8241     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8242   %}
8243   ins_pipe( pipe_slow );
8244 %}
8245 
8246 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8247   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8248   match(Set dst (LShiftVS dst shift));
8249   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8250   ins_encode %{
8251     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8252   %}
8253   ins_pipe( pipe_slow );
8254 %}
8255 
8256 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
8257   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8258   match(Set dst (LShiftVS src shift));
8259   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8260   ins_encode %{
8261     int vector_len = 0;
8262     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8263   %}
8264   ins_pipe( pipe_slow );
8265 %}
8266 
8267 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8268   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8269   match(Set dst (LShiftVS src shift));
8270   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8271   ins_encode %{
8272     int vector_len = 0;
8273     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8274   %}
8275   ins_pipe( pipe_slow );
8276 %}
8277 
8278 instruct vsll8S(vecX dst, vecS shift) %{
8279   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8280   match(Set dst (LShiftVS dst shift));
8281   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8282   ins_encode %{
8283     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8284   %}
8285   ins_pipe( pipe_slow );
8286 %}
8287 
8288 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8289   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8290   match(Set dst (LShiftVS dst shift));
8291   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8292   ins_encode %{
8293     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8294   %}
8295   ins_pipe( pipe_slow );
8296 %}
8297 
8298 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
8299   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8300   match(Set dst (LShiftVS src shift));
8301   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8302   ins_encode %{
8303     int vector_len = 0;
8304     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8305   %}
8306   ins_pipe( pipe_slow );
8307 %}
8308 
8309 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8310   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8311   match(Set dst (LShiftVS src shift));
8312   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8313   ins_encode %{
8314     int vector_len = 0;
8315     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8316   %}
8317   ins_pipe( pipe_slow );
8318 %}
8319 
8320 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
8321   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8322   match(Set dst (LShiftVS src shift));
8323   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8324   ins_encode %{
8325     int vector_len = 1;
8326     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8327   %}
8328   ins_pipe( pipe_slow );
8329 %}
8330 
8331 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8332   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8333   match(Set dst (LShiftVS src shift));
8334   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8335   ins_encode %{
8336     int vector_len = 1;
8337     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8338   %}
8339   ins_pipe( pipe_slow );
8340 %}
8341 
8342 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
8343   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8344   match(Set dst (LShiftVS src shift));
8345   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8346   ins_encode %{
8347     int vector_len = 2;
8348     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8349   %}
8350   ins_pipe( pipe_slow );
8351 %}
8352 
8353 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8354   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8355   match(Set dst (LShiftVS src shift));
8356   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
8357   ins_encode %{
8358     int vector_len = 2;
8359     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8360   %}
8361   ins_pipe( pipe_slow );
8362 %}
8363 
8364 // Integers vector left shift
8365 instruct vsll2I(vecD dst, vecS shift) %{
8366   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8367   match(Set dst (LShiftVI dst shift));
8368   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8369   ins_encode %{
8370     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8371   %}
8372   ins_pipe( pipe_slow );
8373 %}
8374 
8375 instruct vsll2I_imm(vecD dst, immI8 shift) %{
8376   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8377   match(Set dst (LShiftVI dst shift));
8378   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
8379   ins_encode %{
8380     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8381   %}
8382   ins_pipe( pipe_slow );
8383 %}
8384 
8385 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
8386   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8387   match(Set dst (LShiftVI src shift));
8388   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8389   ins_encode %{
8390     int vector_len = 0;
8391     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8392   %}
8393   ins_pipe( pipe_slow );
8394 %}
8395 
8396 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8397   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8398   match(Set dst (LShiftVI src shift));
8399   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
8400   ins_encode %{
8401     int vector_len = 0;
8402     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8403   %}
8404   ins_pipe( pipe_slow );
8405 %}
8406 
8407 instruct vsll4I(vecX dst, vecS shift) %{
8408   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8409   match(Set dst (LShiftVI dst shift));
8410   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8411   ins_encode %{
8412     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
8413   %}
8414   ins_pipe( pipe_slow );
8415 %}
8416 
8417 instruct vsll4I_imm(vecX dst, immI8 shift) %{
8418   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8419   match(Set dst (LShiftVI dst shift));
8420   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
8421   ins_encode %{
8422     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
8423   %}
8424   ins_pipe( pipe_slow );
8425 %}
8426 
8427 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
8428   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8429   match(Set dst (LShiftVI src shift));
8430   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8431   ins_encode %{
8432     int vector_len = 0;
8433     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8434   %}
8435   ins_pipe( pipe_slow );
8436 %}
8437 
8438 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8439   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8440   match(Set dst (LShiftVI src shift));
8441   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
8442   ins_encode %{
8443     int vector_len = 0;
8444     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8445   %}
8446   ins_pipe( pipe_slow );
8447 %}
8448 
8449 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
8450   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8451   match(Set dst (LShiftVI src shift));
8452   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8453   ins_encode %{
8454     int vector_len = 1;
8455     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8456   %}
8457   ins_pipe( pipe_slow );
8458 %}
8459 
8460 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8461   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8462   match(Set dst (LShiftVI src shift));
8463   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
8464   ins_encode %{
8465     int vector_len = 1;
8466     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8467   %}
8468   ins_pipe( pipe_slow );
8469 %}
8470 
8471 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
8472   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8473   match(Set dst (LShiftVI src shift));
8474   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8475   ins_encode %{
8476     int vector_len = 2;
8477     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8478   %}
8479   ins_pipe( pipe_slow );
8480 %}
8481 
8482 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8483   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8484   match(Set dst (LShiftVI src shift));
8485   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
8486   ins_encode %{
8487     int vector_len = 2;
8488     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8489   %}
8490   ins_pipe( pipe_slow );
8491 %}
8492 
8493 // Longs vector left shift
8494 instruct vsll2L(vecX dst, vecS shift) %{
8495   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8496   match(Set dst (LShiftVL dst shift));
8497   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8498   ins_encode %{
8499     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
8500   %}
8501   ins_pipe( pipe_slow );
8502 %}
8503 
8504 instruct vsll2L_imm(vecX dst, immI8 shift) %{
8505   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8506   match(Set dst (LShiftVL dst shift));
8507   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
8508   ins_encode %{
8509     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
8510   %}
8511   ins_pipe( pipe_slow );
8512 %}
8513 
8514 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
8515   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8516   match(Set dst (LShiftVL src shift));
8517   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8518   ins_encode %{
8519     int vector_len = 0;
8520     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8521   %}
8522   ins_pipe( pipe_slow );
8523 %}
8524 
8525 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8526   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8527   match(Set dst (LShiftVL src shift));
8528   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
8529   ins_encode %{
8530     int vector_len = 0;
8531     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8532   %}
8533   ins_pipe( pipe_slow );
8534 %}
8535 
8536 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
8537   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8538   match(Set dst (LShiftVL src shift));
8539   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8540   ins_encode %{
8541     int vector_len = 1;
8542     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8543   %}
8544   ins_pipe( pipe_slow );
8545 %}
8546 
8547 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8548   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8549   match(Set dst (LShiftVL src shift));
8550   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
8551   ins_encode %{
8552     int vector_len = 1;
8553     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8554   %}
8555   ins_pipe( pipe_slow );
8556 %}
8557 
8558 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
8559   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8560   match(Set dst (LShiftVL src shift));
8561   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8562   ins_encode %{
8563     int vector_len = 2;
8564     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8565   %}
8566   ins_pipe( pipe_slow );
8567 %}
8568 
8569 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8570   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8571   match(Set dst (LShiftVL src shift));
8572   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
8573   ins_encode %{
8574     int vector_len = 2;
8575     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8576   %}
8577   ins_pipe( pipe_slow );
8578 %}
8579 
8580 // ----------------------- LogicalRightShift -----------------------------------
8581 
8582 // Shorts vector logical right shift produces an incorrect Java result
8583 // for negative data because Java code converts short values into ints with
8584 // sign extension before a shift. But char vectors are fine since chars are
8585 // unsigned values.
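     //
     // For example (illustrative values only): with a short element of -1
     // (0xFFFF) and a shift of 2, the scalar Java code computes
     // ((int)-1) >>> 2 = 0x3FFFFFFF, which narrows back to the short -1
     // (0xFFFF), while a packed 16-bit psrlw would produce 0x3FFF (16383).
     // For a char element 0xFFFF the scalar code zero-extends first, giving
     // 0x3FFF, which matches the packed 16-bit result.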
8586 
8587 instruct vsrl2S(vecS dst, vecS shift) %{
8588   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8589   match(Set dst (URShiftVS dst shift));
8590   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8591   ins_encode %{
8592     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8593   %}
8594   ins_pipe( pipe_slow );
8595 %}
8596 
8597 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
8598   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8599   match(Set dst (URShiftVS dst shift));
8600   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
8601   ins_encode %{
8602     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8603   %}
8604   ins_pipe( pipe_slow );
8605 %}
8606 
8607 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
8608   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8609   match(Set dst (URShiftVS src shift));
8610   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8611   ins_encode %{
8612     int vector_len = 0;
8613     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8614   %}
8615   ins_pipe( pipe_slow );
8616 %}
8617 
8618 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
8619   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8620   match(Set dst (URShiftVS src shift));
8621   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
8622   ins_encode %{
8623     int vector_len = 0;
8624     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8625   %}
8626   ins_pipe( pipe_slow );
8627 %}
8628 
8629 instruct vsrl4S(vecD dst, vecS shift) %{
8630   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8631   match(Set dst (URShiftVS dst shift));
8632   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8633   ins_encode %{
8634     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8635   %}
8636   ins_pipe( pipe_slow );
8637 %}
8638 
8639 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
8640   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8641   match(Set dst (URShiftVS dst shift));
8642   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
8643   ins_encode %{
8644     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8645   %}
8646   ins_pipe( pipe_slow );
8647 %}
8648 
8649 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
8650   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8651   match(Set dst (URShiftVS src shift));
8652   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8653   ins_encode %{
8654     int vector_len = 0;
8655     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8656   %}
8657   ins_pipe( pipe_slow );
8658 %}
8659 
8660 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
8661   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8662   match(Set dst (URShiftVS src shift));
8663   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
8664   ins_encode %{
8665     int vector_len = 0;
8666     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8667   %}
8668   ins_pipe( pipe_slow );
8669 %}
8670 
8671 instruct vsrl8S(vecX dst, vecS shift) %{
8672   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8673   match(Set dst (URShiftVS dst shift));
8674   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8675   ins_encode %{
8676     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
8677   %}
8678   ins_pipe( pipe_slow );
8679 %}
8680 
8681 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
8682   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8683   match(Set dst (URShiftVS dst shift));
8684   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
8685   ins_encode %{
8686     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
8687   %}
8688   ins_pipe( pipe_slow );
8689 %}
8690 
8691 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
8692   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8693   match(Set dst (URShiftVS src shift));
8694   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8695   ins_encode %{
8696     int vector_len = 0;
8697     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8698   %}
8699   ins_pipe( pipe_slow );
8700 %}
8701 
8702 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
8703   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8704   match(Set dst (URShiftVS src shift));
8705   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
8706   ins_encode %{
8707     int vector_len = 0;
8708     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8709   %}
8710   ins_pipe( pipe_slow );
8711 %}
8712 
8713 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
8714   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8715   match(Set dst (URShiftVS src shift));
8716   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8717   ins_encode %{
8718     int vector_len = 1;
8719     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8720   %}
8721   ins_pipe( pipe_slow );
8722 %}
8723 
8724 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
8725   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8726   match(Set dst (URShiftVS src shift));
8727   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
8728   ins_encode %{
8729     int vector_len = 1;
8730     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8731   %}
8732   ins_pipe( pipe_slow );
8733 %}
8734 
8735 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
8736   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8737   match(Set dst (URShiftVS src shift));
8738   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8739   ins_encode %{
8740     int vector_len = 2;
8741     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8742   %}
8743   ins_pipe( pipe_slow );
8744 %}
8745 
8746 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8747   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8748   match(Set dst (URShiftVS src shift));
8749   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
8750   ins_encode %{
8751     int vector_len = 2;
8752     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8753   %}
8754   ins_pipe( pipe_slow );
8755 %}
8756 
8757 // Integers vector logical right shift
8758 instruct vsrl2I(vecD dst, vecS shift) %{
8759   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8760   match(Set dst (URShiftVI dst shift));
8761   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8762   ins_encode %{
8763     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8764   %}
8765   ins_pipe( pipe_slow );
8766 %}
8767 
8768 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
8769   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8770   match(Set dst (URShiftVI dst shift));
8771   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
8772   ins_encode %{
8773     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8774   %}
8775   ins_pipe( pipe_slow );
8776 %}
8777 
8778 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
8779   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8780   match(Set dst (URShiftVI src shift));
8781   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8782   ins_encode %{
8783     int vector_len = 0;
8784     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8785   %}
8786   ins_pipe( pipe_slow );
8787 %}
8788 
8789 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
8790   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8791   match(Set dst (URShiftVI src shift));
8792   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
8793   ins_encode %{
8794     int vector_len = 0;
8795     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8796   %}
8797   ins_pipe( pipe_slow );
8798 %}
8799 
8800 instruct vsrl4I(vecX dst, vecS shift) %{
8801   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8802   match(Set dst (URShiftVI dst shift));
8803   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8804   ins_encode %{
8805     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
8806   %}
8807   ins_pipe( pipe_slow );
8808 %}
8809 
8810 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
8811   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8812   match(Set dst (URShiftVI dst shift));
8813   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
8814   ins_encode %{
8815     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
8816   %}
8817   ins_pipe( pipe_slow );
8818 %}
8819 
8820 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
8821   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8822   match(Set dst (URShiftVI src shift));
8823   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8824   ins_encode %{
8825     int vector_len = 0;
8826     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8827   %}
8828   ins_pipe( pipe_slow );
8829 %}
8830 
8831 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
8832   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8833   match(Set dst (URShiftVI src shift));
8834   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
8835   ins_encode %{
8836     int vector_len = 0;
8837     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8838   %}
8839   ins_pipe( pipe_slow );
8840 %}
8841 
8842 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
8843   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8844   match(Set dst (URShiftVI src shift));
8845   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8846   ins_encode %{
8847     int vector_len = 1;
8848     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8849   %}
8850   ins_pipe( pipe_slow );
8851 %}
8852 
8853 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
8854   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8855   match(Set dst (URShiftVI src shift));
8856   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
8857   ins_encode %{
8858     int vector_len = 1;
8859     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8860   %}
8861   ins_pipe( pipe_slow );
8862 %}
8863 
8864 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
8865   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8866   match(Set dst (URShiftVI src shift));
8867   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8868   ins_encode %{
8869     int vector_len = 2;
8870     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8871   %}
8872   ins_pipe( pipe_slow );
8873 %}
8874 
8875 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8876   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8877   match(Set dst (URShiftVI src shift));
8878   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
8879   ins_encode %{
8880     int vector_len = 2;
8881     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8882   %}
8883   ins_pipe( pipe_slow );
8884 %}
8885 
8886 // Longs vector logical right shift
8887 instruct vsrl2L(vecX dst, vecS shift) %{
8888   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8889   match(Set dst (URShiftVL dst shift));
8890   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
8891   ins_encode %{
8892     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8893   %}
8894   ins_pipe( pipe_slow );
8895 %}
8896 
8897 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
8898   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8899   match(Set dst (URShiftVL dst shift));
8900   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
8901   ins_encode %{
8902     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
8903   %}
8904   ins_pipe( pipe_slow );
8905 %}
8906 
8907 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
8908   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8909   match(Set dst (URShiftVL src shift));
8910   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
8911   ins_encode %{
8912     int vector_len = 0;
8913     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8914   %}
8915   ins_pipe( pipe_slow );
8916 %}
8917 
8918 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
8919   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8920   match(Set dst (URShiftVL src shift));
8921   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
8922   ins_encode %{
8923     int vector_len = 0;
8924     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8925   %}
8926   ins_pipe( pipe_slow );
8927 %}
8928 
8929 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
8930   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8931   match(Set dst (URShiftVL src shift));
8932   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
8933   ins_encode %{
8934     int vector_len = 1;
8935     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8936   %}
8937   ins_pipe( pipe_slow );
8938 %}
8939 
8940 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
8941   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8942   match(Set dst (URShiftVL src shift));
8943   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
8944   ins_encode %{
8945     int vector_len = 1;
8946     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8947   %}
8948   ins_pipe( pipe_slow );
8949 %}
8950 
8951 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
8952   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8953   match(Set dst (URShiftVL src shift));
8954   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8955   ins_encode %{
8956     int vector_len = 2;
8957     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8958   %}
8959   ins_pipe( pipe_slow );
8960 %}
8961 
8962 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
8963   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8964   match(Set dst (URShiftVL src shift));
8965   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
8966   ins_encode %{
8967     int vector_len = 2;
8968     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8969   %}
8970   ins_pipe( pipe_slow );
8971 %}
8972 
8973 // ------------------- ArithmeticRightShift -----------------------------------
8974 
8975 // Shorts/Chars vector arithmetic right shift
8976 instruct vsra2S(vecS dst, vecS shift) %{
8977   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8978   match(Set dst (RShiftVS dst shift));
8979   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8980   ins_encode %{
8981     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
8982   %}
8983   ins_pipe( pipe_slow );
8984 %}
8985 
8986 instruct vsra2S_imm(vecS dst, immI8 shift) %{
8987   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8988   match(Set dst (RShiftVS dst shift));
8989   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
8990   ins_encode %{
8991     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
8992   %}
8993   ins_pipe( pipe_slow );
8994 %}
8995 
8996 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
8997   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8998   match(Set dst (RShiftVS src shift));
8999   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9000   ins_encode %{
9001     int vector_len = 0;
9002     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9003   %}
9004   ins_pipe( pipe_slow );
9005 %}
9006 
9007 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
9008   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9009   match(Set dst (RShiftVS src shift));
9010   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9011   ins_encode %{
9012     int vector_len = 0;
9013     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9014   %}
9015   ins_pipe( pipe_slow );
9016 %}
9017 
9018 instruct vsra4S(vecD dst, vecS shift) %{
9019   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9020   match(Set dst (RShiftVS dst shift));
9021   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9022   ins_encode %{
9023     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9024   %}
9025   ins_pipe( pipe_slow );
9026 %}
9027 
9028 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9029   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9030   match(Set dst (RShiftVS dst shift));
9031   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9032   ins_encode %{
9033     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9034   %}
9035   ins_pipe( pipe_slow );
9036 %}
9037 
9038 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
9039   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9040   match(Set dst (RShiftVS src shift));
9041   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9042   ins_encode %{
9043     int vector_len = 0;
9044     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9045   %}
9046   ins_pipe( pipe_slow );
9047 %}
9048 
9049 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
9050   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9051   match(Set dst (RShiftVS src shift));
9052   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9053   ins_encode %{
9054     int vector_len = 0;
9055     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9056   %}
9057   ins_pipe( pipe_slow );
9058 %}
9059 
9060 instruct vsra8S(vecX dst, vecS shift) %{
9061   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9062   match(Set dst (RShiftVS dst shift));
9063   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9064   ins_encode %{
9065     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9066   %}
9067   ins_pipe( pipe_slow );
9068 %}
9069 
9070 instruct vsra8S_imm(vecX dst, immI8 shift) %{
9071   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9072   match(Set dst (RShiftVS dst shift));
9073   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
9074   ins_encode %{
9075     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9076   %}
9077   ins_pipe( pipe_slow );
9078 %}
9079 
9080 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
9081   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9082   match(Set dst (RShiftVS src shift));
9083   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9084   ins_encode %{
9085     int vector_len = 0;
9086     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9087   %}
9088   ins_pipe( pipe_slow );
9089 %}
9090 
9091 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
9092   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9093   match(Set dst (RShiftVS src shift));
9094   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
9095   ins_encode %{
9096     int vector_len = 0;
9097     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9098   %}
9099   ins_pipe( pipe_slow );
9100 %}
9101 
9102 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
9103   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9104   match(Set dst (RShiftVS src shift));
9105   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9106   ins_encode %{
9107     int vector_len = 1;
9108     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9109   %}
9110   ins_pipe( pipe_slow );
9111 %}
9112 
9113 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
9114   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9115   match(Set dst (RShiftVS src shift));
9116   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
9117   ins_encode %{
9118     int vector_len = 1;
9119     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9120   %}
9121   ins_pipe( pipe_slow );
9122 %}
9123 
9124 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
9125   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9126   match(Set dst (RShiftVS src shift));
9127   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9128   ins_encode %{
9129     int vector_len = 2;
9130     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9131   %}
9132   ins_pipe( pipe_slow );
9133 %}
9134 
9135 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9136   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9137   match(Set dst (RShiftVS src shift));
9138   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
9139   ins_encode %{
9140     int vector_len = 2;
9141     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9142   %}
9143   ins_pipe( pipe_slow );
9144 %}
9145 
9146 // Integers vector arithmetic right shift
9147 instruct vsra2I(vecD dst, vecS shift) %{
9148   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9149   match(Set dst (RShiftVI dst shift));
9150   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9151   ins_encode %{
9152     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9153   %}
9154   ins_pipe( pipe_slow );
9155 %}
9156 
9157 instruct vsra2I_imm(vecD dst, immI8 shift) %{
9158   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9159   match(Set dst (RShiftVI dst shift));
9160   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
9161   ins_encode %{
9162     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9163   %}
9164   ins_pipe( pipe_slow );
9165 %}
9166 
9167 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
9168   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9169   match(Set dst (RShiftVI src shift));
9170   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9171   ins_encode %{
9172     int vector_len = 0;
9173     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9174   %}
9175   ins_pipe( pipe_slow );
9176 %}
9177 
9178 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9179   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9180   match(Set dst (RShiftVI src shift));
9181   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
9182   ins_encode %{
9183     int vector_len = 0;
9184     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9185   %}
9186   ins_pipe( pipe_slow );
9187 %}
9188 
9189 instruct vsra4I(vecX dst, vecS shift) %{
9190   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9191   match(Set dst (RShiftVI dst shift));
9192   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9193   ins_encode %{
9194     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
9195   %}
9196   ins_pipe( pipe_slow );
9197 %}
9198 
9199 instruct vsra4I_imm(vecX dst, immI8 shift) %{
9200   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9201   match(Set dst (RShiftVI dst shift));
9202   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
9203   ins_encode %{
9204     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
9205   %}
9206   ins_pipe( pipe_slow );
9207 %}
9208 
9209 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
9210   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9211   match(Set dst (RShiftVI src shift));
9212   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9213   ins_encode %{
9214     int vector_len = 0;
9215     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9216   %}
9217   ins_pipe( pipe_slow );
9218 %}
9219 
9220 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9221   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9222   match(Set dst (RShiftVI src shift));
9223   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
9224   ins_encode %{
9225     int vector_len = 0;
9226     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9227   %}
9228   ins_pipe( pipe_slow );
9229 %}
9230 
9231 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
9232   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9233   match(Set dst (RShiftVI src shift));
9234   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9235   ins_encode %{
9236     int vector_len = 1;
9237     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9238   %}
9239   ins_pipe( pipe_slow );
9240 %}
9241 
9242 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9243   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9244   match(Set dst (RShiftVI src shift));
9245   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
9246   ins_encode %{
9247     int vector_len = 1;
9248     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9249   %}
9250   ins_pipe( pipe_slow );
9251 %}
9252 
9253 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
9254   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9255   match(Set dst (RShiftVI src shift));
9256   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9257   ins_encode %{
9258     int vector_len = 2;
9259     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9260   %}
9261   ins_pipe( pipe_slow );
9262 %}
9263 
9264 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9265   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9266   match(Set dst (RShiftVI src shift));
9267   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
9268   ins_encode %{
9269     int vector_len = 2;
9270     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9271   %}
9272   ins_pipe( pipe_slow );
9273 %}
9274 
9275 // There are no vector arithmetic right shift instructions for longs.
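// SSE and AVX2 provide no packed 64-bit arithmetic right shift (psraq/vpsraq only
// exists with AVX-512), so no RShiftVL rules are defined in this section.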
9276 
9277 
9278 // --------------------------------- AND --------------------------------------
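// Bitwise vector logic is element-type agnostic, so the AND/OR/XOR rules below key
// on the total vector size (length_in_bytes) rather than on the element count.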
9279 
9280 instruct vand4B(vecS dst, vecS src) %{
9281   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9282   match(Set dst (AndV dst src));
9283   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
9284   ins_encode %{
9285     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9286   %}
9287   ins_pipe( pipe_slow );
9288 %}
9289 
9290 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
9291   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9292   match(Set dst (AndV src1 src2));
9293   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
9294   ins_encode %{
9295     int vector_len = 0;
9296     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9297   %}
9298   ins_pipe( pipe_slow );
9299 %}
9300 
9301 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
9302   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9303   match(Set dst (AndV src (LoadVector mem)));
9304   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
9305   ins_encode %{
9306     int vector_len = 0;
9307     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vand8B(vecD dst, vecD src) %{
9313   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9314   match(Set dst (AndV dst src));
9315   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
9316   ins_encode %{
9317     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9318   %}
9319   ins_pipe( pipe_slow );
9320 %}
9321 
9322 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
9323   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9324   match(Set dst (AndV src1 src2));
9325   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
9326   ins_encode %{
9327     int vector_len = 0;
9328     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9329   %}
9330   ins_pipe( pipe_slow );
9331 %}
9332 
9333 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
9334   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9335   match(Set dst (AndV src (LoadVector mem)));
9336   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
9337   ins_encode %{
9338     int vector_len = 0;
9339     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9340   %}
9341   ins_pipe( pipe_slow );
9342 %}
9343 
9344 instruct vand16B(vecX dst, vecX src) %{
9345   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9346   match(Set dst (AndV dst src));
9347   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
9348   ins_encode %{
9349     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9350   %}
9351   ins_pipe( pipe_slow );
9352 %}
9353 
9354 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
9355   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9356   match(Set dst (AndV src1 src2));
9357   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
9358   ins_encode %{
9359     int vector_len = 0;
9360     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9361   %}
9362   ins_pipe( pipe_slow );
9363 %}
9364 
9365 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
9366   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9367   match(Set dst (AndV src (LoadVector mem)));
9368   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
9369   ins_encode %{
9370     int vector_len = 0;
9371     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9372   %}
9373   ins_pipe( pipe_slow );
9374 %}
9375 
9376 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
9377   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9378   match(Set dst (AndV src1 src2));
9379   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
9380   ins_encode %{
9381     int vector_len = 1;
9382     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9383   %}
9384   ins_pipe( pipe_slow );
9385 %}
9386 
9387 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
9388   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9389   match(Set dst (AndV src (LoadVector mem)));
9390   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
9391   ins_encode %{
9392     int vector_len = 1;
9393     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9394   %}
9395   ins_pipe( pipe_slow );
9396 %}
9397 
9398 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9399   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9400   match(Set dst (AndV src1 src2));
9401   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
9402   ins_encode %{
9403     int vector_len = 2;
9404     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9405   %}
9406   ins_pipe( pipe_slow );
9407 %}
9408 
9409 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
9410   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9411   match(Set dst (AndV src (LoadVector mem)));
9412   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
9413   ins_encode %{
9414     int vector_len = 2;
9415     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9416   %}
9417   ins_pipe( pipe_slow );
9418 %}
9419 
9420 // --------------------------------- OR ---------------------------------------
9421 
9422 instruct vor4B(vecS dst, vecS src) %{
9423   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9424   match(Set dst (OrV dst src));
9425   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
9426   ins_encode %{
9427     __ por($dst$$XMMRegister, $src$$XMMRegister);
9428   %}
9429   ins_pipe( pipe_slow );
9430 %}
9431 
9432 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
9433   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9434   match(Set dst (OrV src1 src2));
9435   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
9436   ins_encode %{
9437     int vector_len = 0;
9438     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9439   %}
9440   ins_pipe( pipe_slow );
9441 %}
9442 
9443 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
9444   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9445   match(Set dst (OrV src (LoadVector mem)));
9446   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
9447   ins_encode %{
9448     int vector_len = 0;
9449     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9450   %}
9451   ins_pipe( pipe_slow );
9452 %}
9453 
9454 instruct vor8B(vecD dst, vecD src) %{
9455   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9456   match(Set dst (OrV dst src));
9457   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
9458   ins_encode %{
9459     __ por($dst$$XMMRegister, $src$$XMMRegister);
9460   %}
9461   ins_pipe( pipe_slow );
9462 %}
9463 
9464 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9465   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9466   match(Set dst (OrV src1 src2));
9467   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9468   ins_encode %{
9469     int vector_len = 0;
9470     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9471   %}
9472   ins_pipe( pipe_slow );
9473 %}
9474 
9475 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9476   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9477   match(Set dst (OrV src (LoadVector mem)));
9478   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9479   ins_encode %{
9480     int vector_len = 0;
9481     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9482   %}
9483   ins_pipe( pipe_slow );
9484 %}
9485 
9486 instruct vor16B(vecX dst, vecX src) %{
9487   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9488   match(Set dst (OrV dst src));
9489   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9490   ins_encode %{
9491     __ por($dst$$XMMRegister, $src$$XMMRegister);
9492   %}
9493   ins_pipe( pipe_slow );
9494 %}
9495 
9496 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9497   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9498   match(Set dst (OrV src1 src2));
9499   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9500   ins_encode %{
9501     int vector_len = 0;
9502     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9503   %}
9504   ins_pipe( pipe_slow );
9505 %}
9506 
9507 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9508   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9509   match(Set dst (OrV src (LoadVector mem)));
9510   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9511   ins_encode %{
9512     int vector_len = 0;
9513     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9514   %}
9515   ins_pipe( pipe_slow );
9516 %}
9517 
9518 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9519   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9520   match(Set dst (OrV src1 src2));
9521   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9522   ins_encode %{
9523     int vector_len = 1;
9524     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9525   %}
9526   ins_pipe( pipe_slow );
9527 %}
9528 
9529 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9530   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9531   match(Set dst (OrV src (LoadVector mem)));
9532   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9533   ins_encode %{
9534     int vector_len = 1;
9535     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9536   %}
9537   ins_pipe( pipe_slow );
9538 %}
9539 
9540 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9541   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9542   match(Set dst (OrV src1 src2));
9543   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9544   ins_encode %{
9545     int vector_len = 2;
9546     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9547   %}
9548   ins_pipe( pipe_slow );
9549 %}
9550 
9551 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9552   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9553   match(Set dst (OrV src (LoadVector mem)));
9554   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9555   ins_encode %{
9556     int vector_len = 2;
9557     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9558   %}
9559   ins_pipe( pipe_slow );
9560 %}
9561 
9562 // --------------------------------- XOR --------------------------------------
9563 
9564 instruct vxor4B(vecS dst, vecS src) %{
9565   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9566   match(Set dst (XorV dst src));
9567   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9568   ins_encode %{
9569     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9570   %}
9571   ins_pipe( pipe_slow );
9572 %}
9573 
9574 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9575   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9576   match(Set dst (XorV src1 src2));
9577   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9578   ins_encode %{
9579     int vector_len = 0;
9580     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9581   %}
9582   ins_pipe( pipe_slow );
9583 %}
9584 
9585 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9586   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9587   match(Set dst (XorV src (LoadVector mem)));
9588   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9589   ins_encode %{
9590     int vector_len = 0;
9591     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9592   %}
9593   ins_pipe( pipe_slow );
9594 %}
9595 
9596 instruct vxor8B(vecD dst, vecD src) %{
9597   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9598   match(Set dst (XorV dst src));
9599   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9600   ins_encode %{
9601     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9602   %}
9603   ins_pipe( pipe_slow );
9604 %}
9605 
9606 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9607   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9608   match(Set dst (XorV src1 src2));
9609   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9610   ins_encode %{
9611     int vector_len = 0;
9612     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9613   %}
9614   ins_pipe( pipe_slow );
9615 %}
9616 
9617 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9618   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9619   match(Set dst (XorV src (LoadVector mem)));
9620   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9621   ins_encode %{
9622     int vector_len = 0;
9623     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9624   %}
9625   ins_pipe( pipe_slow );
9626 %}
9627 
9628 instruct vxor16B(vecX dst, vecX src) %{
9629   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9630   match(Set dst (XorV dst src));
9631   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9632   ins_encode %{
9633     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9634   %}
9635   ins_pipe( pipe_slow );
9636 %}
9637 
9638 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9639   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9640   match(Set dst (XorV src1 src2));
9641   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9642   ins_encode %{
9643     int vector_len = 0;
9644     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9645   %}
9646   ins_pipe( pipe_slow );
9647 %}
9648 
9649 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9650   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9651   match(Set dst (XorV src (LoadVector mem)));
9652   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9653   ins_encode %{
9654     int vector_len = 0;
9655     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9656   %}
9657   ins_pipe( pipe_slow );
9658 %}
9659 
9660 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9661   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9662   match(Set dst (XorV src1 src2));
9663   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9664   ins_encode %{
9665     int vector_len = 1;
9666     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9667   %}
9668   ins_pipe( pipe_slow );
9669 %}
9670 
9671 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9672   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9673   match(Set dst (XorV src (LoadVector mem)));
9674   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9675   ins_encode %{
9676     int vector_len = 1;
9677     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9678   %}
9679   ins_pipe( pipe_slow );
9680 %}
9681 
9682 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9683   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9684   match(Set dst (XorV src1 src2));
9685   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9686   ins_encode %{
9687     int vector_len = 2;
9688     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9689   %}
9690   ins_pipe( pipe_slow );
9691 %}
9692 
9693 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9694   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9695   match(Set dst (XorV src (LoadVector mem)));
9696   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9697   ins_encode %{
9698     int vector_len = 2;
9699     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9700   %}
9701   ins_pipe( pipe_slow );
9702 %}
9703 
9704 // --------------------------------- FMA --------------------------------------
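// The rules below keep the accumulator c as both an input and the destination
// (match rule: Set c (FmaVD/FmaVF c (Binary a b))), so each maps to a single fused
// multiply-add; they are only selected when UseFMA is enabled.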
9705 
9706 // a * b + c
9707 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9708   predicate(UseFMA && n->as_Vector()->length() == 2);
9709   match(Set c (FmaVD  c (Binary a b)));
9710   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9711   ins_cost(150);
9712   ins_encode %{
9713     int vector_len = 0;
9714     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9715   %}
9716   ins_pipe( pipe_slow );
9717 %}
9718 
9719 // a * b + c
9720 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9721   predicate(UseFMA && n->as_Vector()->length() == 2);
9722   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9723   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9724   ins_cost(150);
9725   ins_encode %{
9726     int vector_len = 0;
9727     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9728   %}
9729   ins_pipe( pipe_slow );
9730 %}
9731 
9732 
9733 // a * b + c
9734 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9735   predicate(UseFMA && n->as_Vector()->length() == 4);
9736   match(Set c (FmaVD  c (Binary a b)));
9737   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9738   ins_cost(150);
9739   ins_encode %{
9740     int vector_len = 1;
9741     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9742   %}
9743   ins_pipe( pipe_slow );
9744 %}
9745 
9746 // a * b + c
9747 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9748   predicate(UseFMA && n->as_Vector()->length() == 4);
9749   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9750   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9751   ins_cost(150);
9752   ins_encode %{
9753     int vector_len = 1;
9754     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9755   %}
9756   ins_pipe( pipe_slow );
9757 %}
9758 
9759 // a * b + c
9760 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9761   predicate(UseFMA && n->as_Vector()->length() == 8);
9762   match(Set c (FmaVD  c (Binary a b)));
9763   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9764   ins_cost(150);
9765   ins_encode %{
9766     int vector_len = 2;
9767     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9768   %}
9769   ins_pipe( pipe_slow );
9770 %}
9771 
9772 // a * b + c
9773 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9774   predicate(UseFMA && n->as_Vector()->length() == 8);
9775   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9776   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9777   ins_cost(150);
9778   ins_encode %{
9779     int vector_len = 2;
9780     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9781   %}
9782   ins_pipe( pipe_slow );
9783 %}
9784 
9785 // a * b + c
9786 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9787   predicate(UseFMA && n->as_Vector()->length() == 4);
9788   match(Set c (FmaVF  c (Binary a b)));
9789   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9790   ins_cost(150);
9791   ins_encode %{
9792     int vector_len = 0;
9793     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9794   %}
9795   ins_pipe( pipe_slow );
9796 %}
9797 
9798 // a * b + c
9799 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9800   predicate(UseFMA && n->as_Vector()->length() == 4);
9801   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9802   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9803   ins_cost(150);
9804   ins_encode %{
9805     int vector_len = 0;
9806     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9807   %}
9808   ins_pipe( pipe_slow );
9809 %}
9810 
9811 // a * b + c
9812 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9813   predicate(UseFMA && n->as_Vector()->length() == 8);
9814   match(Set c (FmaVF  c (Binary a b)));
9815   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9816   ins_cost(150);
9817   ins_encode %{
9818     int vector_len = 1;
9819     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9820   %}
9821   ins_pipe( pipe_slow );
9822 %}
9823 
9824 // a * b + c
9825 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9826   predicate(UseFMA && n->as_Vector()->length() == 8);
9827   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9828   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9829   ins_cost(150);
9830   ins_encode %{
9831     int vector_len = 1;
9832     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9833   %}
9834   ins_pipe( pipe_slow );
9835 %}
9836 
9837 // a * b + c
9838 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9839   predicate(UseFMA && n->as_Vector()->length() == 16);
9840   match(Set c (FmaVF  c (Binary a b)));
9841   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9842   ins_cost(150);
9843   ins_encode %{
9844     int vector_len = 2;
9845     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9846   %}
9847   ins_pipe( pipe_slow );
9848 %}
9849 
9850 // a * b + c
9851 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9852   predicate(UseFMA && n->as_Vector()->length() == 16);
9853   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9854   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9855   ins_cost(150);
9856   ins_encode %{
9857     int vector_len = 2;
9858     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9859   %}
9860   ins_pipe( pipe_slow );
9861 %}
9862 
9863 // --------------------------------- Vector Multiply Add --------------------------------------
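// pmaddwd/vpmaddwd multiply adjacent pairs of signed 16-bit elements and sum each
// pair into one signed 32-bit result, e.g. dst[0] = src1[0]*src2[0] + src1[1]*src2[1],
// which is the semantics MulAddVS2VI expects.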
9864 
9865 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9866   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9867   match(Set dst (MulAddVS2VI dst src1));
9868   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9869   ins_encode %{
9870     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9871   %}
9872   ins_pipe( pipe_slow );
9873 %}
9874 
9875 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9876   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9877   match(Set dst (MulAddVS2VI src1 src2));
9878   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9879   ins_encode %{
9880     int vector_len = 0;
9881     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9882   %}
9883   ins_pipe( pipe_slow );
9884 %}
9885 
9886 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
9887   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
9888   match(Set dst (MulAddVS2VI dst src1));
9889   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
9890   ins_encode %{
9891     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9892   %}
9893   ins_pipe( pipe_slow );
9894 %}
9895 
9896 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9897   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9898   match(Set dst (MulAddVS2VI src1 src2));
9899   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
9900   ins_encode %{
9901     int vector_len = 0;
9902     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9903   %}
9904   ins_pipe( pipe_slow );
9905 %}
9906 
9907 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9908   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9909   match(Set dst (MulAddVS2VI src1 src2));
9910   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
9911   ins_encode %{
9912     int vector_len = 1;
9913     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9914   %}
9915   ins_pipe( pipe_slow );
9916 %}
9917 
9918 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9919   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9920   match(Set dst (MulAddVS2VI src1 src2));
9921   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
9922   ins_encode %{
9923     int vector_len = 2;
9924     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9925   %}
9926   ins_pipe( pipe_slow );
9927 %}
9928 
9929 // --------------------------------- Vector Multiply Add Add ----------------------------------
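// With AVX-512 VNNI (evpdpwssd), the multiply-add and the following vector add into
// the same destination collapse into one instruction: adjacent signed 16-bit products
// are summed and accumulated into the 32-bit elements of $dst. The low ins_cost nudges
// the matcher toward these fused rules instead of a separate pmaddwd + paddd sequence.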
9930 
9931 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9932   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
9933   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9934   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
9935   ins_encode %{
9936     int vector_len = 0;
9937     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9938   %}
9939   ins_pipe( pipe_slow );
9940   ins_cost(10);
9941 %}
9942 
9943 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9944   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
9945   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9946   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
9947   ins_encode %{
9948     int vector_len = 0;
9949     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9950   %}
9951   ins_pipe( pipe_slow );
9952   ins_cost(10);
9953 %}
9954 
9955 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9956   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
9957   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9958   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
9959   ins_encode %{
9960     int vector_len = 1;
9961     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9962   %}
9963   ins_pipe( pipe_slow );
9964   ins_cost(10);
9965 %}
9966 
9967 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9968   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
9969   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9970   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
9971   ins_encode %{
9972     int vector_len = 2;
9973     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9974   %}
9975   ins_pipe( pipe_slow );
9976   ins_cost(10);
9977 %}
9978 
9979 // --------------------------------- PopCount --------------------------------------
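// vpopcntd (AVX-512 VPOPCNTDQ) counts the set bits in each 32-bit element, so
// PopCountVI maps to a single instruction when the feature is present and
// UsePopCountInstruction is enabled.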
9980 
9981 instruct vpopcount2I(vecD dst, vecD src) %{
9982   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
9983   match(Set dst (PopCountVI src));
9984   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
9985   ins_encode %{
9986     int vector_len = 0;
9987     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9988   %}
9989   ins_pipe( pipe_slow );
9990 %}
9991 
9992 instruct vpopcount4I(vecX dst, vecX src) %{
9993   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
9994   match(Set dst (PopCountVI src));
9995   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
9996   ins_encode %{
9997     int vector_len = 0;
9998     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9999   %}
10000   ins_pipe( pipe_slow );
10001 %}
10002 
10003 instruct vpopcount8I(vecY dst, vecY src) %{
10004   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
10005   match(Set dst (PopCountVI src));
10006   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
10007   ins_encode %{
10008     int vector_len = 1;
10009     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10010   %}
10011   ins_pipe( pipe_slow );
10012 %}
10013 
10014 instruct vpopcount16I(vecZ dst, vecZ src) %{
10015   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
10016   match(Set dst (PopCountVI src));
10017   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
10018   ins_encode %{
10019     int vector_len = 2;
10020     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10021   %}
10022   ins_pipe( pipe_slow );
10023 %}