1 //
   2 // Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding, vm register );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
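     //
     // As a concrete reading of this format, the first definition below,
     //     reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
     // declares XMM0 as Save-On-Call for both the allocator and the C calling
     // convention, spills and fills it as a float (Op_RegF), gives it hardware
     // encoding 0, and binds it to the VMReg returned by xmm0->as_VMReg().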
  61 
  62 // XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-(p).
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No XMM registers are preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM15 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
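     //
     // For example, XMM0 is modeled below as the 16 float-sized slots
     // XMM0, XMM0b, ..., XMM0p.  A scalar Float lives in word (a) only and a
     // Double in words (ab), while the vector register classes further down
     // take word (a) for 32-bit vectors, words (a)-(b) for 64-bit vectors,
     // (a)-(d) for 128-bit, (a)-(h) for 256-bit, and all sixteen words for a
     // full 512-bit vector.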
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
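     // Flags register.  It gets its own singleton allocation class (chunk2)
     // and register class (int_flags) below.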
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre-EVEX float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for EVEX float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
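     // Each reg_class_dynamic below selects its first (EVEX) class when the
     // guard expression is true and falls back to the legacy class otherwise;
     // the *_vl variants additionally require AVX-512VL support.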
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 733 
 734 // Class for pre-EVEX double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for EVEX double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre-EVEX 32-bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for EVEX 32-bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for pre-EVEX 64-bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for EVEX 64-bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for pre-EVEX 128-bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for EVEX 128-bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
1285     // three 5 byte instructions plus one move for unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1376   static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
1377   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
1378   static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
1379   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
1380   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1381   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1382 
1383 //=============================================================================
1384 
1385 
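// Pointer-to-member-function types used to select MacroAssembler
// instructions by ideal opcode in the vector shift/abs/neg helpers below.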
1386 typedef void (MacroAssembler::*XX_Inst)(XMMRegister, XMMRegister);
1387 typedef void (MacroAssembler::*XAR_Inst)(XMMRegister, AddressLiteral, Register);
1388 typedef void (MacroAssembler::*XXI_Inst)(XMMRegister, XMMRegister, int);
1389 typedef void (MacroAssembler::*XXAIR_Inst)(XMMRegister, XMMRegister, AddressLiteral, int, Register);
1390 typedef void (MacroAssembler::*XXXI_Inst)(XMMRegister, XMMRegister, XMMRegister, int);
1391 
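// Map a vector shift opcode to the corresponding SSE shift instruction
// (shift count in the low bits of an XMM register).  Byte shifts use the
// word forms; the byte elements are widened first (see emit_vshift*B_code).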
1392 XX_Inst get_xx_inst(int opcode)  {
1394   switch(opcode) {
1395     case Op_RShiftVB:
1396     case Op_RShiftVS:
1397       return &MacroAssembler::psraw;
1398     case Op_LShiftVB:
1399     case Op_LShiftVS:
1400       return &MacroAssembler::psllw;
1401     case Op_URShiftVB:
1402     case Op_URShiftVS:
1403       return &MacroAssembler::psrlw;
1404     case Op_RShiftVI:
1405       return &MacroAssembler::psrad;
1406     case Op_LShiftVI:
1407       return &MacroAssembler::pslld;
1408     case Op_URShiftVI:
1409       return &MacroAssembler::psrld;
1410     case Op_LShiftVL:
1411       return &MacroAssembler::psllq;
1412     case Op_RShiftVL:
1413     case Op_URShiftVL:
1414       return &MacroAssembler::psrlq;
1415     default:
1416       return NULL;
1417   }
1418 }
1419 
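// Map AbsV/NegV float and double opcodes to the SSE bitwise instruction
// used to apply the sign mask: and(ps|pd) for abs, xor(ps|pd) for neg.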
1420 XAR_Inst get_xar_inst(int opcode)  {
1422   switch(opcode) {
1423     case Op_AbsVF:
1424       return &MacroAssembler::andps;
1425     case Op_AbsVD:
1426       return &MacroAssembler::andpd;
1427     case Op_NegVF:
1428       return &MacroAssembler::xorps;
1429     case Op_NegVD:
1430       return &MacroAssembler::xorpd;
1431     default:
1432       return NULL;
1433   }
1434 }
1435 
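// AVX counterpart of get_xar_inst: three-operand vand/vxor forms that take
// the sign mask as an AddressLiteral plus a scratch register.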
1436 XXAIR_Inst get_xxair_inst(int opcode)  {
1438   switch(opcode) {
1439     case Op_AbsVF:
1440       return &MacroAssembler::vandps;
1441     case Op_AbsVD:
1442       return &MacroAssembler::vandpd;
1443     case Op_NegVF:
1444       return &MacroAssembler::vxorps;
1445     case Op_NegVD:
1446       return &MacroAssembler::vxorpd;
1447     default:
1448       return NULL;
1449   }
1450 }
1451 
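// Map a vector shift opcode to the AVX three-operand shift instruction.
// Arithmetic long right shift requires the EVEX-only evpsraq form.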
1452 XXXI_Inst get_xxxi_inst(int opcode)  {
1454   switch(opcode) {
1455     case Op_RShiftVB:
1456     case Op_RShiftVS:
1457       return &MacroAssembler::vpsraw;
1458     case Op_LShiftVB:
1459     case Op_LShiftVS:
1460       return &MacroAssembler::vpsllw;
1461     case Op_URShiftVB:
1462     case Op_URShiftVS:
1463       return &MacroAssembler::vpsrlw;
1464     case Op_RShiftVI:
1465       return &MacroAssembler::vpsrad;
1466     case Op_LShiftVI:
1467       return &MacroAssembler::vpslld;
1468     case Op_URShiftVI:
1469       return &MacroAssembler::vpsrld;
1470     case Op_RShiftVL:
1471       return &MacroAssembler::evpsraq;
1472     case Op_LShiftVL:
1473       return &MacroAssembler::vpsllq;
1474     case Op_URShiftVL:
1475       return &MacroAssembler::vpsrlq;
1476     default:
1477       return NULL;
1478   }
1479 }
1480 
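// Byte-to-word extension used before byte shifts: sign-extend (pmovsxbw)
// or zero-extend (pmovzxbw) depending on the shift kind.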
1481 XX_Inst get_extend_inst(bool sign)  {
1482   XX_Inst inst;
1483   if (sign)
1484     inst = &MacroAssembler::pmovsxbw;
1485   else 
1486     inst = &MacroAssembler::pmovzxbw;
1487   return inst;
1488 }
1489 
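// AVX byte-to-word extension (vpmovsxbw/vpmovzxbw) with an explicit vector length.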
1490 XXI_Inst get_avx_extend_inst(bool sign)  {
1491   XXI_Inst inst;
1492   if (sign)
1493     inst = &MacroAssembler::vpmovsxbw;
1494   else 
1495     inst = &MacroAssembler::vpmovzxbw;
1496   return inst;
1497 }
1498 
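// Constant mask matching the instruction selected by get_xar_inst/get_xxair_inst:
// sign mask for abs, sign flip for neg.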
1499 AddressLiteral get_mask(int opcode)  {
1500   switch(opcode) {
1501     case Op_AbsVF:
1502       return ExternalAddress(vector_float_signmask());
1503     case Op_AbsVD:
1504       return ExternalAddress(vector_double_signmask());
1505     case Op_NegVF:
1506       return ExternalAddress(vector_float_signflip());
1507     case Op_NegVD:
1508       return ExternalAddress(vector_double_signflip());
1509     default:
1510       return ExternalAddress(vector_double_signflip());
1511   }
1512 }
// 4-byte / 8-byte vector shift: widen bytes to words, shift, then mask and
// pack back to bytes.  A scratch register is needed to load the mask.
1514 void emit_vshift4Bor8B_code(MacroAssembler& _masm, int opcode, XMMRegister dst,
1515                      XMMRegister src, XMMRegister shift, 
1516                      XMMRegister tmp, Register scratch) {
  XX_Inst extendinst = get_extend_inst(opcode != Op_URShiftVB);
1518   XX_Inst shiftinst = get_xx_inst(opcode);
1519 
1520   (_masm.*extendinst)(tmp, src);
1521   (_masm.*shiftinst)(tmp, shift);
1522   __ movdqu(dst, ExternalAddress(vector_short_to_byte_mask()), scratch); 
1523   __ pand(dst, tmp);
1524   __ packuswb(dst, dst);
1525 }
1526 
// 16-byte vector shift: widen each 8-byte half to words, shift, then mask
// and pack both halves back to bytes.  A scratch register is needed to load the mask.
1528 void emit_vshift16B_code(MacroAssembler& _masm, int opcode, XMMRegister dst,
1529                         XMMRegister src, XMMRegister shift, 
1530                         XMMRegister tmp1, XMMRegister tmp2, Register scratch) {
  XX_Inst extendinst = get_extend_inst(opcode != Op_URShiftVB);
1532   XX_Inst shiftinst = get_xx_inst(opcode);
1533 
1534   (_masm.*extendinst)(tmp1, src);
1535   (_masm.*shiftinst)(tmp1, shift);
1536   __ pshufd(tmp2, src, 0xE);
1537   (_masm.*extendinst)(tmp2, tmp2);
1538   (_masm.*shiftinst)(tmp2, shift);
1539   __ movdqu(dst, ExternalAddress(vector_short_to_byte_mask()), scratch);
1540   __ pand(tmp2, dst);
1541   __ pand(dst, tmp1);
1542   __ packuswb(dst, tmp2);
1543 }
1544 
1545 
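// 16-byte vector shift using AVX2: widen all 16 bytes to words in a YMM
// register, shift, mask back to byte range and pack into an XMM result.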
1546 void emit_vshift16B_avx_code(MacroAssembler& _masm, int opcode, XMMRegister dst,
1547                             XMMRegister src, XMMRegister shift, 
1548                             XMMRegister tmp, Register scratch) {
  XXI_Inst extendinst = get_avx_extend_inst(opcode != Op_URShiftVB);
1550   XXXI_Inst shiftinst = get_xxxi_inst(opcode);
1551 
1552   int vector_len = 1;
1553   (_masm.*extendinst)(tmp, src, vector_len);
1554   (_masm.*shiftinst)(tmp, tmp, shift, vector_len);
1555   __ vpand(tmp, tmp, ExternalAddress(vector_short_to_byte_mask()), vector_len, scratch);
1556   __ vextracti128_high(dst, tmp);
1557   __ vpackuswb(dst, tmp, dst, 0);
1558 }
1559 
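// 32-byte vector shift using AVX2: process the two 16-byte halves as words,
// then pack and use vpermq to restore the original lane order.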
1560 void emit_vshift32B_avx_code(MacroAssembler& _masm, int opcode, XMMRegister dst,
1561                             XMMRegister src, XMMRegister shift, 
1562                             XMMRegister tmp, Register scratch) {
  XXI_Inst extendinst = get_avx_extend_inst(opcode != Op_URShiftVB);
1564   XXXI_Inst shiftinst = get_xxxi_inst(opcode);
1565 
1566   int vector_len = 1;
1567   __ vextracti128_high(tmp, src);
1568   (_masm.*extendinst)(tmp, tmp, vector_len);
1569   (_masm.*extendinst)(dst, src, vector_len);
1570   (_masm.*shiftinst)(tmp, tmp, shift, vector_len);
1571   (_masm.*shiftinst)(dst, dst, shift, vector_len);
1572   __ vpand(tmp, tmp, ExternalAddress(vector_short_to_byte_mask()), vector_len, scratch);
1573   __ vpand(dst, dst, ExternalAddress(vector_short_to_byte_mask()), vector_len, scratch);
1574   __ vpackuswb(dst, dst, tmp, vector_len);
1575   __ vpermq(dst, dst, 0xD8, vector_len);
1576 }
1577 
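// 64-byte vector shift using AVX-512: widen each 32-byte half to words in a
// ZMM register, shift, mask, pack, and permute back into element order.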
1578 void emit_vshift64B_avx_code(MacroAssembler& _masm, int opcode, XMMRegister dst,
1579                             XMMRegister src, XMMRegister shift, 
1580                             XMMRegister tmp1, XMMRegister tmp2, Register scratch) {
  XXI_Inst extendinst = get_avx_extend_inst(opcode != Op_URShiftVB);
1582   XXXI_Inst shiftinst = get_xxxi_inst(opcode);
1583 
1584   int vector_len = 2;
1585   __ vextracti64x4(tmp1, src, 1);
1586   (_masm.*extendinst)(tmp1, tmp1, vector_len);
1587   (_masm.*extendinst)(tmp2, src, vector_len);
1588   (_masm.*shiftinst)(tmp1, tmp1, shift, vector_len);
1589   (_masm.*shiftinst)(tmp2, tmp2, shift, vector_len);
1590   __ vmovdqu(dst, ExternalAddress(vector_short_to_byte_mask()), scratch);
1591   __ vpbroadcastd(dst, dst, vector_len);
1592   __ vpand(tmp1, tmp1, dst, vector_len);
1593   __ vpand(tmp2, tmp2, dst, vector_len);
1594   __ vpackuswb(dst, tmp1, tmp2, vector_len);
1595   __ evmovdquq(tmp2, ExternalAddress(vector_byte_perm_mask()), vector_len, scratch);
1596   __ vpermq(dst, tmp2, dst, vector_len);
1597 }
1598 
1599 //=============================================================================
1600 const bool Matcher::match_rule_supported(int opcode) {
1601   if (!has_match_rule(opcode))
1602     return false;
1603 
1604   bool ret_value = true;
1605   switch (opcode) {
    case Op_AbsVL:
      if (UseAVX < 3)
        ret_value = false;
      break;
1609     case Op_PopCountI:
1610     case Op_PopCountL:
1611       if (!UsePopCountInstruction)
1612         ret_value = false;
1613       break;
1614     case Op_PopCountVI:
1615       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1616         ret_value = false;
1617       break;
1618     case Op_MulVI:
1619       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1620         ret_value = false;
1621       break;
1622     case Op_MulVL:
1623     case Op_MulReductionVL:
1624       if (VM_Version::supports_avx512dq() == false)
1625         ret_value = false;
1626       break;
1627     case Op_AddReductionVL:
1628       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1629         ret_value = false;
1630       break;
1631     case Op_AbsVB:
1632     case Op_AbsVS:
1633     case Op_AbsVI:
1634     case Op_AddReductionVI:
1635       if (UseSSE < 3) // requires at least SSE3
1636         ret_value = false;
1637       break;
1638     case Op_MulReductionVI:
1639       if (UseSSE < 4) // requires at least SSE4
1640         ret_value = false;
1641       break;
1642     case Op_AddReductionVF:
1643     case Op_AddReductionVD:
1644     case Op_MulReductionVF:
1645     case Op_MulReductionVD:
1646       if (UseSSE < 1) // requires at least SSE
1647         ret_value = false;
1648       break;
1649     case Op_SqrtVD:
1650     case Op_SqrtVF:
1651       if (UseAVX < 1) // enabled for AVX only
1652         ret_value = false;
1653       break;
1654     case Op_CompareAndSwapL:
1655 #ifdef _LP64
1656     case Op_CompareAndSwapP:
1657 #endif
1658       if (!VM_Version::supports_cx8())
1659         ret_value = false;
1660       break;
1661     case Op_CMoveVF:
1662     case Op_CMoveVD:
1663       if (UseAVX < 1 || UseAVX > 2)
1664         ret_value = false;
1665       break;
1666     case Op_StrIndexOf:
1667       if (!UseSSE42Intrinsics)
1668         ret_value = false;
1669       break;
1670     case Op_StrIndexOfChar:
1671       if (!UseSSE42Intrinsics)
1672         ret_value = false;
1673       break;
1674     case Op_OnSpinWait:
1675       if (VM_Version::supports_on_spin_wait() == false)
1676         ret_value = false;
1677       break;
1678     case Op_MulAddVS2VI:
1679     case Op_RShiftVL:
1680     case Op_AbsVD:
1681     case Op_NegVD:
1682       if (UseSSE < 2)
1683         ret_value = false;
1684       break;
1685     case Op_MulVB:
1686     case Op_LShiftVB:
1687     case Op_RShiftVB:
1688     case Op_URShiftVB:
1689       if (UseSSE < 4)
1690         ret_value = false;
1691       break;
1692 #ifdef _LP64
1693     case Op_MaxD:
1694     case Op_MaxF:
1695     case Op_MinD:
1696     case Op_MinF:
1697       if (UseAVX < 1) // enabled for AVX only
1698         ret_value = false;
1699       break;
1700 #endif
1701   }
1702 
  return ret_value;  // By default match rules are supported.
1704 }
1705 
1706 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_ vector nodes and other intrinsics, while guarding with vlen.
1709   bool ret_value = match_rule_supported(opcode);
1710   if (ret_value) {
1711     switch (opcode) {
1712       case Op_AbsVB:
1713       case Op_AddVB:
1714       case Op_SubVB:
1715         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1716           ret_value = false;
1717         break;
1718       case Op_AbsVS:
1719       case Op_AddVS:
1720       case Op_SubVS:
1721       case Op_MulVS:
1722       case Op_LShiftVS:
1723       case Op_RShiftVS:
1724       case Op_URShiftVS:
1725         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1726           ret_value = false;
1727         break;
1728       case Op_MulVB:
1729       case Op_LShiftVB:
1730       case Op_RShiftVB:
1731       case Op_URShiftVB:
1732         if ((vlen == 32 && UseAVX < 2) || 
1733             ((vlen == 64) && (VM_Version::supports_avx512bw() == false)))
1734           ret_value = false;
1735         break;
1736       case Op_NegVF:
1737         if ((vlen == 16) && (VM_Version::supports_avx512dq() == false))
1738           ret_value = false;
1739         break;
1740       case Op_CMoveVF:
1741         if (vlen != 8)
1742           ret_value  = false;
1743         break;
1744       case Op_NegVD:
1745         if ((vlen == 8) && (VM_Version::supports_avx512dq() == false))
1746           ret_value = false;
1747         break;
1748       case Op_CMoveVD:
1749         if (vlen != 4)
1750           ret_value  = false;
1751         break;
1752     }
1753   }
1754 
  return ret_value;  // By default match rules are supported.
1756 }
1757 
1758 const bool Matcher::has_predicated_vectors(void) {
1759   bool ret_value = false;
1760   if (UseAVX > 2) {
1761     ret_value = VM_Version::supports_avx512vl();
1762   }
1763 
1764   return ret_value;
1765 }
1766 
1767 const int Matcher::float_pressure(int default_pressure_threshold) {
1768   int float_pressure_threshold = default_pressure_threshold;
1769 #ifdef _LP64
1770   if (UseAVX > 2) {
1771     // Increase pressure threshold on machines with AVX3 which have
1772     // 2x more XMM registers.
1773     float_pressure_threshold = default_pressure_threshold * 2;
1774   }
1775 #endif
1776   return float_pressure_threshold;
1777 }
1778 
1779 // Max vector size in bytes. 0 if not supported.
1780 const int Matcher::vector_width_in_bytes(BasicType bt) {
1781   assert(is_java_primitive(bt), "only primitive type vectors");
1782   if (UseSSE < 2) return 0;
1783   // SSE2 supports 128bit vectors for all types.
1784   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
1786   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1787   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1788   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1789     size = (UseAVX > 2) ? 64 : 32;
1790   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1791     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1792   // Use flag to limit vector size.
1793   size = MIN2(size,(int)MaxVectorSize);
1794   // Minimum 2 values in vector (or 4 for bytes).
1795   switch (bt) {
1796   case T_DOUBLE:
1797   case T_LONG:
1798     if (size < 16) return 0;
1799     break;
1800   case T_FLOAT:
1801   case T_INT:
1802     if (size < 8) return 0;
1803     break;
1804   case T_BOOLEAN:
1805     if (size < 4) return 0;
1806     break;
1807   case T_CHAR:
1808     if (size < 4) return 0;
1809     break;
1810   case T_BYTE:
1811     if (size < 4) return 0;
1812     break;
1813   case T_SHORT:
1814     if (size < 4) return 0;
1815     break;
1816   default:
1817     ShouldNotReachHere();
1818   }
1819   return size;
1820 }
1821 
1822 // Limits on vector size (number of elements) loaded into vector.
1823 const int Matcher::max_vector_size(const BasicType bt) {
1824   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1825 }
1826 const int Matcher::min_vector_size(const BasicType bt) {
1827   int max_size = max_vector_size(bt);
1828   // Min size which can be loaded into vector is 4 bytes.
1829   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1830   return MIN2(size,max_size);
1831 }
1832 
1833 // Vector ideal reg corresponding to specified size in bytes
1834 const uint Matcher::vector_ideal_reg(int size) {
1835   assert(MaxVectorSize >= size, "");
1836   switch(size) {
1837     case  4: return Op_VecS;
1838     case  8: return Op_VecD;
1839     case 16: return Op_VecX;
1840     case 32: return Op_VecY;
1841     case 64: return Op_VecZ;
1842   }
1843   ShouldNotReachHere();
1844   return 0;
1845 }
1846 
1847 // Only lowest bits of xmm reg are used for vector shift count.
1848 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1849   return Op_VecS;
1850 }
1851 
1852 // x86 supports misaligned vectors store/load.
1853 const bool Matcher::misaligned_vectors_ok() {
1854   return true;
1855 }
1856 
1857 // x86 AES instructions are compatible with SunJCE expanded
1858 // keys, hence we do not need to pass the original key to stubs
1859 const bool Matcher::pass_original_key_for_aes() {
1860   return false;
1861 }
1862 
1863 
1864 const bool Matcher::convi2l_type_required = true;
1865 
1866 // Check for shift by small constant as well
1867 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1868   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1869       shift->in(2)->get_int() <= 3 &&
1870       // Are there other uses besides address expressions?
1871       !matcher->is_visited(shift)) {
1872     address_visited.set(shift->_idx); // Flag as address_visited
1873     mstack.push(shift->in(2), Matcher::Visit);
1874     Node *conv = shift->in(1);
1875 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1879     if (conv->Opcode() == Op_ConvI2L &&
1880         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1881         // Are there other uses besides address expressions?
1882         !matcher->is_visited(conv)) {
1883       address_visited.set(conv->_idx); // Flag as address_visited
1884       mstack.push(conv->in(1), Matcher::Pre_Visit);
1885     } else
1886 #endif
1887       mstack.push(conv, Matcher::Pre_Visit);
1888     return true;
1889   }
1890   return false;
1891 }
1892 
1893 // Should the Matcher clone shifts on addressing modes, expecting them
1894 // to be subsumed into complex addressing expressions or compute them
1895 // into registers?
1896 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1897   Node *off = m->in(AddPNode::Offset);
1898   if (off->is_Con()) {
1899     address_visited.test_set(m->_idx); // Flag as address_visited
1900     Node *adr = m->in(AddPNode::Address);
1901 
1902     // Intel can handle 2 adds in addressing mode
1903     // AtomicAdd is not an addressing expression.
1904     // Cheap to find it by looking for screwy base.
1905     if (adr->is_AddP() &&
1906         !adr->in(AddPNode::Base)->is_top() &&
1907         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1908         // Are there other uses besides address expressions?
1909         !is_visited(adr)) {
1910       address_visited.set(adr->_idx); // Flag as address_visited
1911       Node *shift = adr->in(AddPNode::Offset);
1912       if (!clone_shift(shift, this, mstack, address_visited)) {
1913         mstack.push(shift, Pre_Visit);
1914       }
1915       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1916       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1917     } else {
1918       mstack.push(adr, Pre_Visit);
1919     }
1920 
1921     // Clone X+offset as it also folds into most addressing expressions
1922     mstack.push(off, Visit);
1923     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1924     return true;
1925   } else if (clone_shift(off, this, mstack, address_visited)) {
1926     address_visited.test_set(m->_idx); // Flag as address_visited
1927     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1928     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1929     return true;
1930   }
1931   return false;
1932 }
1933 
1934 void Compile::reshape_address(AddPNode* addp) {
1935 }
1936 
1937 // Helper methods for MachSpillCopyNode::implementation().
1938 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1939                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is complex, so instructions are
  // emitted into a scratch buffer to determine their size.
1942   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1943   assert(ireg == Op_VecS || // 32bit vector
1944          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1945          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1946          "no non-adjacent vector moves" );
1947   if (cbuf) {
1948     MacroAssembler _masm(cbuf);
1949     int offset = __ offset();
1950     switch (ireg) {
1951     case Op_VecS: // copy whole register
1952     case Op_VecD:
1953     case Op_VecX:
1954 #ifndef _LP64
1955       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1956 #else
1957       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1958         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1959       } else {
1960         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1961      }
1962 #endif
1963       break;
1964     case Op_VecY:
1965 #ifndef _LP64
1966       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1967 #else
1968       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1969         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1970       } else {
1971         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1972      }
1973 #endif
1974       break;
1975     case Op_VecZ:
1976       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1977       break;
1978     default:
1979       ShouldNotReachHere();
1980     }
1981     int size = __ offset() - offset;
1982 #ifdef ASSERT
1983     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1985 #endif
1986     return size;
1987 #ifndef PRODUCT
1988   } else if (!do_size) {
1989     switch (ireg) {
1990     case Op_VecS:
1991     case Op_VecD:
1992     case Op_VecX:
1993       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1994       break;
1995     case Op_VecY:
1996     case Op_VecZ:
1997       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1998       break;
1999     default:
2000       ShouldNotReachHere();
2001     }
2002 #endif
2003   }
2004   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
2005   return (UseAVX > 2) ? 6 : 4;
2006 }
2007 
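// Emit (or, without a code buffer, size/print) a vector spill copy between
// an XMM register and a stack slot.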
2008 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
2009                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is complex, so instructions are
  // emitted into a scratch buffer to determine their size.
2012   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
2013   if (cbuf) {
2014     MacroAssembler _masm(cbuf);
2015     int offset = __ offset();
2016     if (is_load) {
2017       switch (ireg) {
2018       case Op_VecS:
2019         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2020         break;
2021       case Op_VecD:
2022         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2023         break;
2024       case Op_VecX:
2025 #ifndef _LP64
2026         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2027 #else
2028         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2029           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2030         } else {
2031           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2032           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2033         }
2034 #endif
2035         break;
2036       case Op_VecY:
2037 #ifndef _LP64
2038         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2039 #else
2040         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2041           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2042         } else {
2043           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2044           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2045         }
2046 #endif
2047         break;
2048       case Op_VecZ:
2049         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2050         break;
2051       default:
2052         ShouldNotReachHere();
2053       }
2054     } else { // store
2055       switch (ireg) {
2056       case Op_VecS:
2057         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2058         break;
2059       case Op_VecD:
2060         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2061         break;
2062       case Op_VecX:
2063 #ifndef _LP64
2064         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2065 #else
2066         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2067           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2068         }
2069         else {
2070           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2071         }
2072 #endif
2073         break;
2074       case Op_VecY:
2075 #ifndef _LP64
2076         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2077 #else
2078         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2079           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2080         }
2081         else {
2082           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2083         }
2084 #endif
2085         break;
2086       case Op_VecZ:
2087         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2088         break;
2089       default:
2090         ShouldNotReachHere();
2091       }
2092     }
2093     int size = __ offset() - offset;
2094 #ifdef ASSERT
2095     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
2096     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
2097     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
2098 #endif
2099     return size;
2100 #ifndef PRODUCT
2101   } else if (!do_size) {
2102     if (is_load) {
2103       switch (ireg) {
2104       case Op_VecS:
2105         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2106         break;
2107       case Op_VecD:
2108         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2109         break;
2110       case Op_VecX:
2111         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2112         break;
2113       case Op_VecY:
2114       case Op_VecZ:
2115         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2116         break;
2117       default:
2118         ShouldNotReachHere();
2119       }
2120     } else { // store
2121       switch (ireg) {
2122       case Op_VecS:
2123         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2124         break;
2125       case Op_VecD:
2126         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2127         break;
2128       case Op_VecX:
2129         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2130         break;
2131       case Op_VecY:
2132       case Op_VecZ:
2133         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2134         break;
2135       default:
2136         ShouldNotReachHere();
2137       }
2138     }
2139 #endif
2140   }
2141   bool is_single_byte = false;
2142   int vec_len = 0;
2143   if ((UseAVX > 2) && (stack_offset != 0)) {
2144     int tuple_type = Assembler::EVEX_FVM;
2145     int input_size = Assembler::EVEX_32bit;
2146     switch (ireg) {
2147     case Op_VecS:
2148       tuple_type = Assembler::EVEX_T1S;
2149       break;
2150     case Op_VecD:
2151       tuple_type = Assembler::EVEX_T1S;
2152       input_size = Assembler::EVEX_64bit;
2153       break;
2154     case Op_VecX:
2155       break;
2156     case Op_VecY:
2157       vec_len = 1;
2158       break;
2159     case Op_VecZ:
2160       vec_len = 2;
2161       break;
2162     }
2163     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
2164   }
2165   int offset_size = 0;
2166   int size = 5;
2167   if (UseAVX > 2) {
2168     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
2169       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2170       size += 2; // Need an additional two bytes for EVEX encoding
2171     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
2172       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2173     } else {
2174       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2175       size += 2; // Need an additional two bytes for EVEX encoding
2176     }
2177   } else {
2178     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2179   }
2180   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
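  // Editorial note: the 5-byte base assumed above covers a 2-byte VEX prefix (or legacy
  // SIMD prefix plus the 0x0F escape), the opcode, the ModRM byte, and the SIB byte
  // required for rsp-based addressing; the EVEX branches add 2 bytes for the 4-byte EVEX prefix.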
2181   return size+offset_size;
2182 }
2183 
2184 static inline jint replicate4_imm(int con, int width) {
2185   // Take a constant of "width" bytes and replicate it to fill a 32-bit value.
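  // Worked example (editorial illustration): replicate4_imm(0xAB, 1) yields 0xABABABAB,
  // and replicate4_imm(0x1234, 2) yields 0x12341234.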
2186   assert(width == 1 || width == 2, "only byte or short types here");
2187   int bit_width = width * 8;
2188   jint val = con;
2189   val &= (1 << bit_width) - 1;  // mask off sign bits
2190   while(bit_width < 32) {
2191     val |= (val << bit_width);
2192     bit_width <<= 1;
2193   }
2194   return val;
2195 }
2196 
2197 static inline jlong replicate8_imm(int con, int width) {
2198   // Take a constant of "width" bytes and replicate it to fill a 64-bit value.
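  // Worked example (editorial illustration): replicate8_imm(0xAB, 1) yields
  // 0xABABABABABABABAB, and replicate8_imm(0x12345678, 4) yields 0x1234567812345678.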
2199   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2200   int bit_width = width * 8;
2201   jlong val = con;
2202   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2203   while(bit_width < 64) {
2204     val |= (val << bit_width);
2205     bit_width <<= 1;
2206   }
2207   return val;
2208 }
2209 
2210 #ifndef PRODUCT
2211   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2212     st->print("nop \t# %d bytes pad for loops and calls", _count);
2213   }
2214 #endif
2215 
2216   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2217     MacroAssembler _masm(&cbuf);
2218     __ nop(_count);
2219   }
2220 
2221   uint MachNopNode::size(PhaseRegAlloc*) const {
2222     return _count;
2223   }
2224 
2225 #ifndef PRODUCT
2226   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2227     st->print("# breakpoint");
2228   }
2229 #endif
2230 
2231   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2232     MacroAssembler _masm(&cbuf);
2233     __ int3();
2234   }
2235 
2236   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2237     return MachNode::size(ra_);
2238   }
2239 
2240 %}
2241 
2242 encode %{
2243 
2244   enc_class call_epilog %{
2245     if (VerifyStackAtCalls) {
2246       // Check that stack depth is unchanged: find magic cookie on stack
2247       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2248       MacroAssembler _masm(&cbuf);
2249       Label L;
2250       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2251       __ jccb(Assembler::equal, L);
2252       // Die if stack mismatch
2253       __ int3();
2254       __ bind(L);
2255     }
2256   %}
2257 
2258 %}
2259 
2260 
2261 //----------OPERANDS-----------------------------------------------------------
2262 // Operand definitions must precede instruction definitions for correct parsing
2263 // in the ADLC because operands constitute user defined types which are used in
2264 // instruction definitions.
2265 
2266 operand vecZ() %{
2267   constraint(ALLOC_IN_RC(vectorz_reg));
2268   match(VecZ);
2269 
2270   format %{ %}
2271   interface(REG_INTER);
2272 %}
2273 
2274 operand legVecZ() %{
2275   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2276   match(VecZ);
2277 
2278   format %{ %}
2279   interface(REG_INTER);
2280 %}
2281 
2282 // Comparison Code for FP conditional move
2283 operand cmpOp_vcmppd() %{
2284   match(Bool);
2285 
2286   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2287             n->as_Bool()->_test._test != BoolTest::no_overflow);
2288   format %{ "" %}
2289   interface(COND_INTER) %{
2290     equal        (0x0, "eq");
2291     less         (0x1, "lt");
2292     less_equal   (0x2, "le");
2293     not_equal    (0xC, "ne");
2294     greater_equal(0xD, "ge");
2295     greater      (0xE, "gt");
2296     //TODO: the ADLC cannot compile this operand without the next two lines; it fails with the error:
2297     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2298     // equal' for overflow.
2299     overflow     (0x20, "o");  // not really supported by the instruction
2300     no_overflow  (0x21, "no"); // not really supported by the instruction
2301   %}
2302 %}
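// Editorial note: the encodings above appear to match the AVX compare-predicate
// immediates consumed by vcmpps/vcmppd (0x0 = EQ_OQ, 0x1 = LT_OS, 0x2 = LE_OS,
// 0xC = NEQ_OQ, 0xD = GE_OS, 0xE = GT_OS); overflow/no_overflow have no such predicate.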
2303 
2304 
2305 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2306 
2307 // ============================================================================
2308 
2309 instruct ShouldNotReachHere() %{
2310   match(Halt);
2311   format %{ "ud2\t# ShouldNotReachHere" %}
2312   ins_encode %{
2313     __ ud2();
2314   %}
2315   ins_pipe(pipe_slow);
2316 %}
2317 
2318 // =================================EVEX special===============================
2319 
2320 instruct setMask(rRegI dst, rRegI src) %{
2321   predicate(Matcher::has_predicated_vectors());
2322   match(Set dst (SetVectMaskI  src));
2323   effect(TEMP dst);
2324   format %{ "setvectmask   $dst, $src" %}
2325   ins_encode %{
2326     __ setvectmask($dst$$Register, $src$$Register);
2327   %}
2328   ins_pipe(pipe_slow);
2329 %}
2330 
2331 // ============================================================================
2332 
2333 instruct addF_reg(regF dst, regF src) %{
2334   predicate((UseSSE>=1) && (UseAVX == 0));
2335   match(Set dst (AddF dst src));
2336 
2337   format %{ "addss   $dst, $src" %}
2338   ins_cost(150);
2339   ins_encode %{
2340     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2341   %}
2342   ins_pipe(pipe_slow);
2343 %}
2344 
2345 instruct addF_mem(regF dst, memory src) %{
2346   predicate((UseSSE>=1) && (UseAVX == 0));
2347   match(Set dst (AddF dst (LoadF src)));
2348 
2349   format %{ "addss   $dst, $src" %}
2350   ins_cost(150);
2351   ins_encode %{
2352     __ addss($dst$$XMMRegister, $src$$Address);
2353   %}
2354   ins_pipe(pipe_slow);
2355 %}
2356 
2357 instruct addF_imm(regF dst, immF con) %{
2358   predicate((UseSSE>=1) && (UseAVX == 0));
2359   match(Set dst (AddF dst con));
2360   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2361   ins_cost(150);
2362   ins_encode %{
2363     __ addss($dst$$XMMRegister, $constantaddress($con));
2364   %}
2365   ins_pipe(pipe_slow);
2366 %}
2367 
2368 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2369   predicate(UseAVX > 0);
2370   match(Set dst (AddF src1 src2));
2371 
2372   format %{ "vaddss  $dst, $src1, $src2" %}
2373   ins_cost(150);
2374   ins_encode %{
2375     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2376   %}
2377   ins_pipe(pipe_slow);
2378 %}
2379 
2380 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2381   predicate(UseAVX > 0);
2382   match(Set dst (AddF src1 (LoadF src2)));
2383 
2384   format %{ "vaddss  $dst, $src1, $src2" %}
2385   ins_cost(150);
2386   ins_encode %{
2387     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2388   %}
2389   ins_pipe(pipe_slow);
2390 %}
2391 
2392 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2393   predicate(UseAVX > 0);
2394   match(Set dst (AddF src con));
2395 
2396   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2397   ins_cost(150);
2398   ins_encode %{
2399     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2400   %}
2401   ins_pipe(pipe_slow);
2402 %}
2403 
2404 instruct addD_reg(regD dst, regD src) %{
2405   predicate((UseSSE>=2) && (UseAVX == 0));
2406   match(Set dst (AddD dst src));
2407 
2408   format %{ "addsd   $dst, $src" %}
2409   ins_cost(150);
2410   ins_encode %{
2411     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2412   %}
2413   ins_pipe(pipe_slow);
2414 %}
2415 
2416 instruct addD_mem(regD dst, memory src) %{
2417   predicate((UseSSE>=2) && (UseAVX == 0));
2418   match(Set dst (AddD dst (LoadD src)));
2419 
2420   format %{ "addsd   $dst, $src" %}
2421   ins_cost(150);
2422   ins_encode %{
2423     __ addsd($dst$$XMMRegister, $src$$Address);
2424   %}
2425   ins_pipe(pipe_slow);
2426 %}
2427 
2428 instruct addD_imm(regD dst, immD con) %{
2429   predicate((UseSSE>=2) && (UseAVX == 0));
2430   match(Set dst (AddD dst con));
2431   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2432   ins_cost(150);
2433   ins_encode %{
2434     __ addsd($dst$$XMMRegister, $constantaddress($con));
2435   %}
2436   ins_pipe(pipe_slow);
2437 %}
2438 
2439 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2440   predicate(UseAVX > 0);
2441   match(Set dst (AddD src1 src2));
2442 
2443   format %{ "vaddsd  $dst, $src1, $src2" %}
2444   ins_cost(150);
2445   ins_encode %{
2446     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2447   %}
2448   ins_pipe(pipe_slow);
2449 %}
2450 
2451 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2452   predicate(UseAVX > 0);
2453   match(Set dst (AddD src1 (LoadD src2)));
2454 
2455   format %{ "vaddsd  $dst, $src1, $src2" %}
2456   ins_cost(150);
2457   ins_encode %{
2458     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2459   %}
2460   ins_pipe(pipe_slow);
2461 %}
2462 
2463 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2464   predicate(UseAVX > 0);
2465   match(Set dst (AddD src con));
2466 
2467   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2468   ins_cost(150);
2469   ins_encode %{
2470     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2471   %}
2472   ins_pipe(pipe_slow);
2473 %}
2474 
2475 instruct subF_reg(regF dst, regF src) %{
2476   predicate((UseSSE>=1) && (UseAVX == 0));
2477   match(Set dst (SubF dst src));
2478 
2479   format %{ "subss   $dst, $src" %}
2480   ins_cost(150);
2481   ins_encode %{
2482     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2483   %}
2484   ins_pipe(pipe_slow);
2485 %}
2486 
2487 instruct subF_mem(regF dst, memory src) %{
2488   predicate((UseSSE>=1) && (UseAVX == 0));
2489   match(Set dst (SubF dst (LoadF src)));
2490 
2491   format %{ "subss   $dst, $src" %}
2492   ins_cost(150);
2493   ins_encode %{
2494     __ subss($dst$$XMMRegister, $src$$Address);
2495   %}
2496   ins_pipe(pipe_slow);
2497 %}
2498 
2499 instruct subF_imm(regF dst, immF con) %{
2500   predicate((UseSSE>=1) && (UseAVX == 0));
2501   match(Set dst (SubF dst con));
2502   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2503   ins_cost(150);
2504   ins_encode %{
2505     __ subss($dst$$XMMRegister, $constantaddress($con));
2506   %}
2507   ins_pipe(pipe_slow);
2508 %}
2509 
2510 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2511   predicate(UseAVX > 0);
2512   match(Set dst (SubF src1 src2));
2513 
2514   format %{ "vsubss  $dst, $src1, $src2" %}
2515   ins_cost(150);
2516   ins_encode %{
2517     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2518   %}
2519   ins_pipe(pipe_slow);
2520 %}
2521 
2522 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2523   predicate(UseAVX > 0);
2524   match(Set dst (SubF src1 (LoadF src2)));
2525 
2526   format %{ "vsubss  $dst, $src1, $src2" %}
2527   ins_cost(150);
2528   ins_encode %{
2529     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2530   %}
2531   ins_pipe(pipe_slow);
2532 %}
2533 
2534 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2535   predicate(UseAVX > 0);
2536   match(Set dst (SubF src con));
2537 
2538   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2539   ins_cost(150);
2540   ins_encode %{
2541     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2542   %}
2543   ins_pipe(pipe_slow);
2544 %}
2545 
2546 instruct subD_reg(regD dst, regD src) %{
2547   predicate((UseSSE>=2) && (UseAVX == 0));
2548   match(Set dst (SubD dst src));
2549 
2550   format %{ "subsd   $dst, $src" %}
2551   ins_cost(150);
2552   ins_encode %{
2553     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2554   %}
2555   ins_pipe(pipe_slow);
2556 %}
2557 
2558 instruct subD_mem(regD dst, memory src) %{
2559   predicate((UseSSE>=2) && (UseAVX == 0));
2560   match(Set dst (SubD dst (LoadD src)));
2561 
2562   format %{ "subsd   $dst, $src" %}
2563   ins_cost(150);
2564   ins_encode %{
2565     __ subsd($dst$$XMMRegister, $src$$Address);
2566   %}
2567   ins_pipe(pipe_slow);
2568 %}
2569 
2570 instruct subD_imm(regD dst, immD con) %{
2571   predicate((UseSSE>=2) && (UseAVX == 0));
2572   match(Set dst (SubD dst con));
2573   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2574   ins_cost(150);
2575   ins_encode %{
2576     __ subsd($dst$$XMMRegister, $constantaddress($con));
2577   %}
2578   ins_pipe(pipe_slow);
2579 %}
2580 
2581 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2582   predicate(UseAVX > 0);
2583   match(Set dst (SubD src1 src2));
2584 
2585   format %{ "vsubsd  $dst, $src1, $src2" %}
2586   ins_cost(150);
2587   ins_encode %{
2588     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2589   %}
2590   ins_pipe(pipe_slow);
2591 %}
2592 
2593 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2594   predicate(UseAVX > 0);
2595   match(Set dst (SubD src1 (LoadD src2)));
2596 
2597   format %{ "vsubsd  $dst, $src1, $src2" %}
2598   ins_cost(150);
2599   ins_encode %{
2600     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2601   %}
2602   ins_pipe(pipe_slow);
2603 %}
2604 
2605 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2606   predicate(UseAVX > 0);
2607   match(Set dst (SubD src con));
2608 
2609   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2610   ins_cost(150);
2611   ins_encode %{
2612     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2613   %}
2614   ins_pipe(pipe_slow);
2615 %}
2616 
2617 instruct mulF_reg(regF dst, regF src) %{
2618   predicate((UseSSE>=1) && (UseAVX == 0));
2619   match(Set dst (MulF dst src));
2620 
2621   format %{ "mulss   $dst, $src" %}
2622   ins_cost(150);
2623   ins_encode %{
2624     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2625   %}
2626   ins_pipe(pipe_slow);
2627 %}
2628 
2629 instruct mulF_mem(regF dst, memory src) %{
2630   predicate((UseSSE>=1) && (UseAVX == 0));
2631   match(Set dst (MulF dst (LoadF src)));
2632 
2633   format %{ "mulss   $dst, $src" %}
2634   ins_cost(150);
2635   ins_encode %{
2636     __ mulss($dst$$XMMRegister, $src$$Address);
2637   %}
2638   ins_pipe(pipe_slow);
2639 %}
2640 
2641 instruct mulF_imm(regF dst, immF con) %{
2642   predicate((UseSSE>=1) && (UseAVX == 0));
2643   match(Set dst (MulF dst con));
2644   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2645   ins_cost(150);
2646   ins_encode %{
2647     __ mulss($dst$$XMMRegister, $constantaddress($con));
2648   %}
2649   ins_pipe(pipe_slow);
2650 %}
2651 
2652 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2653   predicate(UseAVX > 0);
2654   match(Set dst (MulF src1 src2));
2655 
2656   format %{ "vmulss  $dst, $src1, $src2" %}
2657   ins_cost(150);
2658   ins_encode %{
2659     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2660   %}
2661   ins_pipe(pipe_slow);
2662 %}
2663 
2664 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2665   predicate(UseAVX > 0);
2666   match(Set dst (MulF src1 (LoadF src2)));
2667 
2668   format %{ "vmulss  $dst, $src1, $src2" %}
2669   ins_cost(150);
2670   ins_encode %{
2671     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2672   %}
2673   ins_pipe(pipe_slow);
2674 %}
2675 
2676 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2677   predicate(UseAVX > 0);
2678   match(Set dst (MulF src con));
2679 
2680   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2681   ins_cost(150);
2682   ins_encode %{
2683     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2684   %}
2685   ins_pipe(pipe_slow);
2686 %}
2687 
2688 instruct mulD_reg(regD dst, regD src) %{
2689   predicate((UseSSE>=2) && (UseAVX == 0));
2690   match(Set dst (MulD dst src));
2691 
2692   format %{ "mulsd   $dst, $src" %}
2693   ins_cost(150);
2694   ins_encode %{
2695     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2696   %}
2697   ins_pipe(pipe_slow);
2698 %}
2699 
2700 instruct mulD_mem(regD dst, memory src) %{
2701   predicate((UseSSE>=2) && (UseAVX == 0));
2702   match(Set dst (MulD dst (LoadD src)));
2703 
2704   format %{ "mulsd   $dst, $src" %}
2705   ins_cost(150);
2706   ins_encode %{
2707     __ mulsd($dst$$XMMRegister, $src$$Address);
2708   %}
2709   ins_pipe(pipe_slow);
2710 %}
2711 
2712 instruct mulD_imm(regD dst, immD con) %{
2713   predicate((UseSSE>=2) && (UseAVX == 0));
2714   match(Set dst (MulD dst con));
2715   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2716   ins_cost(150);
2717   ins_encode %{
2718     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2719   %}
2720   ins_pipe(pipe_slow);
2721 %}
2722 
2723 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2724   predicate(UseAVX > 0);
2725   match(Set dst (MulD src1 src2));
2726 
2727   format %{ "vmulsd  $dst, $src1, $src2" %}
2728   ins_cost(150);
2729   ins_encode %{
2730     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2731   %}
2732   ins_pipe(pipe_slow);
2733 %}
2734 
2735 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2736   predicate(UseAVX > 0);
2737   match(Set dst (MulD src1 (LoadD src2)));
2738 
2739   format %{ "vmulsd  $dst, $src1, $src2" %}
2740   ins_cost(150);
2741   ins_encode %{
2742     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2743   %}
2744   ins_pipe(pipe_slow);
2745 %}
2746 
2747 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2748   predicate(UseAVX > 0);
2749   match(Set dst (MulD src con));
2750 
2751   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2752   ins_cost(150);
2753   ins_encode %{
2754     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2755   %}
2756   ins_pipe(pipe_slow);
2757 %}
2758 
2759 instruct divF_reg(regF dst, regF src) %{
2760   predicate((UseSSE>=1) && (UseAVX == 0));
2761   match(Set dst (DivF dst src));
2762 
2763   format %{ "divss   $dst, $src" %}
2764   ins_cost(150);
2765   ins_encode %{
2766     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2767   %}
2768   ins_pipe(pipe_slow);
2769 %}
2770 
2771 instruct divF_mem(regF dst, memory src) %{
2772   predicate((UseSSE>=1) && (UseAVX == 0));
2773   match(Set dst (DivF dst (LoadF src)));
2774 
2775   format %{ "divss   $dst, $src" %}
2776   ins_cost(150);
2777   ins_encode %{
2778     __ divss($dst$$XMMRegister, $src$$Address);
2779   %}
2780   ins_pipe(pipe_slow);
2781 %}
2782 
2783 instruct divF_imm(regF dst, immF con) %{
2784   predicate((UseSSE>=1) && (UseAVX == 0));
2785   match(Set dst (DivF dst con));
2786   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2787   ins_cost(150);
2788   ins_encode %{
2789     __ divss($dst$$XMMRegister, $constantaddress($con));
2790   %}
2791   ins_pipe(pipe_slow);
2792 %}
2793 
2794 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2795   predicate(UseAVX > 0);
2796   match(Set dst (DivF src1 src2));
2797 
2798   format %{ "vdivss  $dst, $src1, $src2" %}
2799   ins_cost(150);
2800   ins_encode %{
2801     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2802   %}
2803   ins_pipe(pipe_slow);
2804 %}
2805 
2806 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2807   predicate(UseAVX > 0);
2808   match(Set dst (DivF src1 (LoadF src2)));
2809 
2810   format %{ "vdivss  $dst, $src1, $src2" %}
2811   ins_cost(150);
2812   ins_encode %{
2813     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2814   %}
2815   ins_pipe(pipe_slow);
2816 %}
2817 
2818 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2819   predicate(UseAVX > 0);
2820   match(Set dst (DivF src con));
2821 
2822   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2823   ins_cost(150);
2824   ins_encode %{
2825     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2826   %}
2827   ins_pipe(pipe_slow);
2828 %}
2829 
2830 instruct divD_reg(regD dst, regD src) %{
2831   predicate((UseSSE>=2) && (UseAVX == 0));
2832   match(Set dst (DivD dst src));
2833 
2834   format %{ "divsd   $dst, $src" %}
2835   ins_cost(150);
2836   ins_encode %{
2837     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2838   %}
2839   ins_pipe(pipe_slow);
2840 %}
2841 
2842 instruct divD_mem(regD dst, memory src) %{
2843   predicate((UseSSE>=2) && (UseAVX == 0));
2844   match(Set dst (DivD dst (LoadD src)));
2845 
2846   format %{ "divsd   $dst, $src" %}
2847   ins_cost(150);
2848   ins_encode %{
2849     __ divsd($dst$$XMMRegister, $src$$Address);
2850   %}
2851   ins_pipe(pipe_slow);
2852 %}
2853 
2854 instruct divD_imm(regD dst, immD con) %{
2855   predicate((UseSSE>=2) && (UseAVX == 0));
2856   match(Set dst (DivD dst con));
2857   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2858   ins_cost(150);
2859   ins_encode %{
2860     __ divsd($dst$$XMMRegister, $constantaddress($con));
2861   %}
2862   ins_pipe(pipe_slow);
2863 %}
2864 
2865 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2866   predicate(UseAVX > 0);
2867   match(Set dst (DivD src1 src2));
2868 
2869   format %{ "vdivsd  $dst, $src1, $src2" %}
2870   ins_cost(150);
2871   ins_encode %{
2872     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2873   %}
2874   ins_pipe(pipe_slow);
2875 %}
2876 
2877 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2878   predicate(UseAVX > 0);
2879   match(Set dst (DivD src1 (LoadD src2)));
2880 
2881   format %{ "vdivsd  $dst, $src1, $src2" %}
2882   ins_cost(150);
2883   ins_encode %{
2884     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2885   %}
2886   ins_pipe(pipe_slow);
2887 %}
2888 
2889 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2890   predicate(UseAVX > 0);
2891   match(Set dst (DivD src con));
2892 
2893   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2894   ins_cost(150);
2895   ins_encode %{
2896     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2897   %}
2898   ins_pipe(pipe_slow);
2899 %}
2900 
2901 instruct absF_reg(regF dst) %{
2902   predicate((UseSSE>=1) && (UseAVX == 0));
2903   match(Set dst (AbsF dst));
2904   ins_cost(150);
2905   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2906   ins_encode %{
2907     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2908   %}
2909   ins_pipe(pipe_slow);
2910 %}
2911 
2912 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2913   predicate(UseAVX > 0);
2914   match(Set dst (AbsF src));
2915   ins_cost(150);
2916   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2917   ins_encode %{
2918     int vector_len = 0;
2919     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2920               ExternalAddress(float_signmask()), vector_len);
2921   %}
2922   ins_pipe(pipe_slow);
2923 %}
2924 
2925 instruct absD_reg(regD dst) %{
2926   predicate((UseSSE>=2) && (UseAVX == 0));
2927   match(Set dst (AbsD dst));
2928   ins_cost(150);
2929   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2930             "# abs double by sign masking" %}
2931   ins_encode %{
2932     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2933   %}
2934   ins_pipe(pipe_slow);
2935 %}
2936 
2937 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2938   predicate(UseAVX > 0);
2939   match(Set dst (AbsD src));
2940   ins_cost(150);
2941   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2942             "# abs double by sign masking" %}
2943   ins_encode %{
2944     int vector_len = 0;
2945     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2946               ExternalAddress(double_signmask()), vector_len);
2947   %}
2948   ins_pipe(pipe_slow);
2949 %}
2950 
2951 instruct negF_reg(regF dst) %{
2952   predicate((UseSSE>=1) && (UseAVX == 0));
2953   match(Set dst (NegF dst));
2954   ins_cost(150);
2955   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2956   ins_encode %{
2957     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2958   %}
2959   ins_pipe(pipe_slow);
2960 %}
2961 
2962 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2963   predicate(UseAVX > 0);
2964   match(Set dst (NegF src));
2965   ins_cost(150);
2966   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2967   ins_encode %{
2968     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2969                  ExternalAddress(float_signflip()));
2970   %}
2971   ins_pipe(pipe_slow);
2972 %}
2973 
2974 instruct negD_reg(regD dst) %{
2975   predicate((UseSSE>=2) && (UseAVX == 0));
2976   match(Set dst (NegD dst));
2977   ins_cost(150);
2978   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2979             "# neg double by sign flipping" %}
2980   ins_encode %{
2981     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2982   %}
2983   ins_pipe(pipe_slow);
2984 %}
2985 
2986 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2987   predicate(UseAVX > 0);
2988   match(Set dst (NegD src));
2989   ins_cost(150);
2990   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2991             "# neg double by sign flipping" %}
2992   ins_encode %{
2993     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2994                  ExternalAddress(double_signflip()));
2995   %}
2996   ins_pipe(pipe_slow);
2997 %}
2998 
2999 instruct sqrtF_reg(regF dst, regF src) %{
3000   predicate(UseSSE>=1);
3001   match(Set dst (SqrtF src));
3002 
3003   format %{ "sqrtss  $dst, $src" %}
3004   ins_cost(150);
3005   ins_encode %{
3006     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
3007   %}
3008   ins_pipe(pipe_slow);
3009 %}
3010 
3011 instruct sqrtF_mem(regF dst, memory src) %{
3012   predicate(UseSSE>=1);
3013   match(Set dst (SqrtF (LoadF src)));
3014 
3015   format %{ "sqrtss  $dst, $src" %}
3016   ins_cost(150);
3017   ins_encode %{
3018     __ sqrtss($dst$$XMMRegister, $src$$Address);
3019   %}
3020   ins_pipe(pipe_slow);
3021 %}
3022 
3023 instruct sqrtF_imm(regF dst, immF con) %{
3024   predicate(UseSSE>=1);
3025   match(Set dst (SqrtF con));
3026 
3027   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3028   ins_cost(150);
3029   ins_encode %{
3030     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
3031   %}
3032   ins_pipe(pipe_slow);
3033 %}
3034 
3035 instruct sqrtD_reg(regD dst, regD src) %{
3036   predicate(UseSSE>=2);
3037   match(Set dst (SqrtD src));
3038 
3039   format %{ "sqrtsd  $dst, $src" %}
3040   ins_cost(150);
3041   ins_encode %{
3042     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
3043   %}
3044   ins_pipe(pipe_slow);
3045 %}
3046 
3047 instruct sqrtD_mem(regD dst, memory src) %{
3048   predicate(UseSSE>=2);
3049   match(Set dst (SqrtD (LoadD src)));
3050 
3051   format %{ "sqrtsd  $dst, $src" %}
3052   ins_cost(150);
3053   ins_encode %{
3054     __ sqrtsd($dst$$XMMRegister, $src$$Address);
3055   %}
3056   ins_pipe(pipe_slow);
3057 %}
3058 
3059 instruct sqrtD_imm(regD dst, immD con) %{
3060   predicate(UseSSE>=2);
3061   match(Set dst (SqrtD con));
3062   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3063   ins_cost(150);
3064   ins_encode %{
3065     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
3066   %}
3067   ins_pipe(pipe_slow);
3068 %}
3069 
3070 instruct onspinwait() %{
3071   match(OnSpinWait);
3072   ins_cost(200);
3073 
3074   format %{
3075     $$template
3076     $$emit$$"pause\t! membar_onspinwait"
3077   %}
3078   ins_encode %{
3079     __ pause();
3080   %}
3081   ins_pipe(pipe_slow);
3082 %}
3083 
3084 // a * b + c
3085 instruct fmaD_reg(regD a, regD b, regD c) %{
3086   predicate(UseFMA);
3087   match(Set c (FmaD  c (Binary a b)));
3088   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3089   ins_cost(150);
3090   ins_encode %{
3091     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3092   %}
3093   ins_pipe( pipe_slow );
3094 %}
3095 
3096 // a * b + c
3097 instruct fmaF_reg(regF a, regF b, regF c) %{
3098   predicate(UseFMA);
3099   match(Set c (FmaF  c (Binary a b)));
3100   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3101   ins_cost(150);
3102   ins_encode %{
3103     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3104   %}
3105   ins_pipe( pipe_slow );
3106 %}
3107 
3108 // ====================VECTOR INSTRUCTIONS=====================================
3109 
3110 
3111 // Load vectors (4 bytes long)
3112 instruct loadV4(vecS dst, memory mem) %{
3113   predicate(n->as_LoadVector()->memory_size() == 4);
3114   match(Set dst (LoadVector mem));
3115   ins_cost(125);
3116   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
3117   ins_encode %{
3118     __ movdl($dst$$XMMRegister, $mem$$Address);
3119   %}
3120   ins_pipe( pipe_slow );
3121 %}
3122 
3123 // Move a vector between register classes (4 bytes)
3124 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
3125   match(Set dst src);
3126   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
3127   ins_encode %{
3128     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
3129   %}
3130   ins_pipe( fpu_reg_reg );
3131 %}
3132 
3133 // Move a vector between register classes (4 bytes)
3134 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
3135   match(Set dst src);
3136   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
3137   ins_encode %{
3138     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
3139   %}
3140   ins_pipe( fpu_reg_reg );
3141 %}
3142 
3143 // Load vectors (8 bytes long)
3144 instruct loadV8(vecD dst, memory mem) %{
3145   predicate(n->as_LoadVector()->memory_size() == 8);
3146   match(Set dst (LoadVector mem));
3147   ins_cost(125);
3148   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
3149   ins_encode %{
3150     __ movq($dst$$XMMRegister, $mem$$Address);
3151   %}
3152   ins_pipe( pipe_slow );
3153 %}
3154 
3155 // Move a vector between register classes (8 bytes)
3156 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
3157   match(Set dst src);
3158   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3159   ins_encode %{
3160     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3161   %}
3162   ins_pipe( fpu_reg_reg );
3163 %}
3164 
3165 // Move a vector between register classes (8 bytes)
3166 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
3167   match(Set dst src);
3168   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3169   ins_encode %{
3170     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3171   %}
3172   ins_pipe( fpu_reg_reg );
3173 %}
3174 
3175 // Load vectors (16 bytes long)
3176 instruct loadV16(vecX dst, memory mem) %{
3177   predicate(n->as_LoadVector()->memory_size() == 16);
3178   match(Set dst (LoadVector mem));
3179   ins_cost(125);
3180   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3181   ins_encode %{
3182     __ movdqu($dst$$XMMRegister, $mem$$Address);
3183   %}
3184   ins_pipe( pipe_slow );
3185 %}
3186 
3187 // Move a vector between register classes (16 bytes)
3188 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
3189   match(Set dst src);
3190   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3191   ins_encode %{
3192     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3193       int vector_len = 2;
3194       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3195     } else {
3196       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3197     }
3198   %}
3199   ins_pipe( fpu_reg_reg );
3200 %}
3201 
3202 // Move a vector between register classes (16 bytes)
3203 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
3204   match(Set dst src);
3205   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3206   ins_encode %{
3207     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3208       int vector_len = 2;
3209       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3210     } else {
3211       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3212     }
3213   %}
3214   ins_pipe( fpu_reg_reg );
3215 %}
3216 
3217 // Load vectors (32 bytes long)
3218 instruct loadV32(vecY dst, memory mem) %{
3219   predicate(n->as_LoadVector()->memory_size() == 32);
3220   match(Set dst (LoadVector mem));
3221   ins_cost(125);
3222   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3223   ins_encode %{
3224     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3225   %}
3226   ins_pipe( pipe_slow );
3227 %}
3228 
3229 // Move a vector between register classes (32 bytes)
3230 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3231   match(Set dst src);
3232   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3233   ins_encode %{
3234     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3235       int vector_len = 2;
3236       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3237     } else {
3238       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3239     }
3240   %}
3241   ins_pipe( fpu_reg_reg );
3242 %}
3243 
3244 // Move a vector between register classes (32 bytes)
3245 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3246   match(Set dst src);
3247   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3248   ins_encode %{
3249     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3250       int vector_len = 2;
3251       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3252     } else {
3253       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3254     }
3255   %}
3256   ins_pipe( fpu_reg_reg );
3257 %}
3258 
3259 // Load vectors (64 bytes long)
3260 instruct loadV64_dword(vecZ dst, memory mem) %{
3261   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3262   match(Set dst (LoadVector mem));
3263   ins_cost(125);
3264   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3265   ins_encode %{
3266     int vector_len = 2;
3267     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3268   %}
3269   ins_pipe( pipe_slow );
3270 %}
3271 
3272 // Load vectors (64 bytes long)
3273 instruct loadV64_qword(vecZ dst, memory mem) %{
3274   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3275   match(Set dst (LoadVector mem));
3276   ins_cost(125);
3277   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3278   ins_encode %{
3279     int vector_len = 2;
3280     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3281   %}
3282   ins_pipe( pipe_slow );
3283 %}
3284 
3285 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3286   match(Set dst src);
3287   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3288   ins_encode %{
3289     int vector_len = 2;
3290     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3291   %}
3292   ins_pipe( fpu_reg_reg );
3293 %}
3294 
3295 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3296   match(Set dst src);
3297   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3298   ins_encode %{
3299     int vector_len = 2;
3300     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3301   %}
3302   ins_pipe( fpu_reg_reg );
3303 %}
3304 
3305 // Store vectors
3306 instruct storeV4(memory mem, vecS src) %{
3307   predicate(n->as_StoreVector()->memory_size() == 4);
3308   match(Set mem (StoreVector mem src));
3309   ins_cost(145);
3310   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3311   ins_encode %{
3312     __ movdl($mem$$Address, $src$$XMMRegister);
3313   %}
3314   ins_pipe( pipe_slow );
3315 %}
3316 
3317 instruct storeV8(memory mem, vecD src) %{
3318   predicate(n->as_StoreVector()->memory_size() == 8);
3319   match(Set mem (StoreVector mem src));
3320   ins_cost(145);
3321   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3322   ins_encode %{
3323     __ movq($mem$$Address, $src$$XMMRegister);
3324   %}
3325   ins_pipe( pipe_slow );
3326 %}
3327 
3328 instruct storeV16(memory mem, vecX src) %{
3329   predicate(n->as_StoreVector()->memory_size() == 16);
3330   match(Set mem (StoreVector mem src));
3331   ins_cost(145);
3332   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3333   ins_encode %{
3334     __ movdqu($mem$$Address, $src$$XMMRegister);
3335   %}
3336   ins_pipe( pipe_slow );
3337 %}
3338 
3339 instruct storeV32(memory mem, vecY src) %{
3340   predicate(n->as_StoreVector()->memory_size() == 32);
3341   match(Set mem (StoreVector mem src));
3342   ins_cost(145);
3343   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3344   ins_encode %{
3345     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3346   %}
3347   ins_pipe( pipe_slow );
3348 %}
3349 
3350 instruct storeV64_dword(memory mem, vecZ src) %{
3351   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3352   match(Set mem (StoreVector mem src));
3353   ins_cost(145);
3354   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3355   ins_encode %{
3356     int vector_len = 2;
3357     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3358   %}
3359   ins_pipe( pipe_slow );
3360 %}
3361 
3362 instruct storeV64_qword(memory mem, vecZ src) %{
3363   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3364   match(Set mem (StoreVector mem src));
3365   ins_cost(145);
3366   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3367   ins_encode %{
3368     int vector_len = 2;
3369     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3370   %}
3371   ins_pipe( pipe_slow );
3372 %}
3373 
3374 // ====================LEGACY REPLICATE=======================================
3375 
3376 instruct Repl4B_mem(vecS dst, memory mem) %{
3377   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3378   match(Set dst (ReplicateB (LoadB mem)));
3379   format %{ "punpcklbw $dst,$mem\n\t"
3380             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3381   ins_encode %{
3382     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3383     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3384   %}
3385   ins_pipe( pipe_slow );
3386 %}
3387 
3388 instruct Repl8B_mem(vecD dst, memory mem) %{
3389   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3390   match(Set dst (ReplicateB (LoadB mem)));
3391   format %{ "punpcklbw $dst,$mem\n\t"
3392             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3393   ins_encode %{
3394     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3395     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3396   %}
3397   ins_pipe( pipe_slow );
3398 %}
3399 
3400 instruct Repl16B(vecX dst, rRegI src) %{
3401   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3402   match(Set dst (ReplicateB src));
3403   format %{ "movd    $dst,$src\n\t"
3404             "punpcklbw $dst,$dst\n\t"
3405             "pshuflw $dst,$dst,0x00\n\t"
3406             "punpcklqdq $dst,$dst\t! replicate16B" %}
3407   ins_encode %{
3408     __ movdl($dst$$XMMRegister, $src$$Register);
3409     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3410     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3411     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3412   %}
3413   ins_pipe( pipe_slow );
3414 %}
3415 
3416 instruct Repl16B_mem(vecX dst, memory mem) %{
3417   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3418   match(Set dst (ReplicateB (LoadB mem)));
3419   format %{ "punpcklbw $dst,$mem\n\t"
3420             "pshuflw $dst,$dst,0x00\n\t"
3421             "punpcklqdq $dst,$dst\t! replicate16B" %}
3422   ins_encode %{
3423     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3424     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3425     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3426   %}
3427   ins_pipe( pipe_slow );
3428 %}
3429 
3430 instruct Repl32B(vecY dst, rRegI src) %{
3431   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3432   match(Set dst (ReplicateB src));
3433   format %{ "movd    $dst,$src\n\t"
3434             "punpcklbw $dst,$dst\n\t"
3435             "pshuflw $dst,$dst,0x00\n\t"
3436             "punpcklqdq $dst,$dst\n\t"
3437             "vinserti128_high $dst,$dst\t! replicate32B" %}
3438   ins_encode %{
3439     __ movdl($dst$$XMMRegister, $src$$Register);
3440     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3441     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3442     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3443     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3444   %}
3445   ins_pipe( pipe_slow );
3446 %}
3447 
3448 instruct Repl32B_mem(vecY dst, memory mem) %{
3449   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3450   match(Set dst (ReplicateB (LoadB mem)));
3451   format %{ "punpcklbw $dst,$mem\n\t"
3452             "pshuflw $dst,$dst,0x00\n\t"
3453             "punpcklqdq $dst,$dst\n\t"
3454             "vinserti128_high $dst,$dst\t! replicate32B" %}
3455   ins_encode %{
3456     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3457     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3458     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3459     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3460   %}
3461   ins_pipe( pipe_slow );
3462 %}
3463 
3464 instruct Repl64B(legVecZ dst, rRegI src) %{
3465   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3466   match(Set dst (ReplicateB src));
3467   format %{ "movd    $dst,$src\n\t"
3468             "punpcklbw $dst,$dst\n\t"
3469             "pshuflw $dst,$dst,0x00\n\t"
3470             "punpcklqdq $dst,$dst\n\t"
3471             "vinserti128_high $dst,$dst\n\t"
3472             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3473   ins_encode %{
3474     __ movdl($dst$$XMMRegister, $src$$Register);
3475     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3476     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3477     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3478     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3479     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3480   %}
3481   ins_pipe( pipe_slow );
3482 %}
3483 
3484 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3485   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3486   match(Set dst (ReplicateB (LoadB mem)));
3487   format %{ "punpcklbw $dst,$mem\n\t"
3488             "pshuflw $dst,$dst,0x00\n\t"
3489             "punpcklqdq $dst,$dst\n\t"
3490             "vinserti128_high $dst,$dst\n\t"
3491             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3492   ins_encode %{
3493     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3494     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3495     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3496     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3497     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3498   %}
3499   ins_pipe( pipe_slow );
3500 %}
3501 
3502 instruct Repl16B_imm(vecX dst, immI con) %{
3503   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3504   match(Set dst (ReplicateB con));
3505   format %{ "movq    $dst,[$constantaddress]\n\t"
3506             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3507   ins_encode %{
3508     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3509     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3510   %}
3511   ins_pipe( pipe_slow );
3512 %}
3513 
3514 instruct Repl32B_imm(vecY dst, immI con) %{
3515   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3516   match(Set dst (ReplicateB con));
3517   format %{ "movq    $dst,[$constantaddress]\n\t"
3518             "punpcklqdq $dst,$dst\n\t"
3519             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3520   ins_encode %{
3521     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3522     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3523     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3524   %}
3525   ins_pipe( pipe_slow );
3526 %}
3527 
3528 instruct Repl64B_imm(legVecZ dst, immI con) %{
3529   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3530   match(Set dst (ReplicateB con));
3531   format %{ "movq    $dst,[$constantaddress]\n\t"
3532             "punpcklqdq $dst,$dst\n\t"
3533             "vinserti128_high $dst,$dst\n\t"
3534             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3535   ins_encode %{
3536     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3537     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3538     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3539     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3540   %}
3541   ins_pipe( pipe_slow );
3542 %}
3543 
3544 instruct Repl4S(vecD dst, rRegI src) %{
3545   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3546   match(Set dst (ReplicateS src));
3547   format %{ "movd    $dst,$src\n\t"
3548             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3549   ins_encode %{
3550     __ movdl($dst$$XMMRegister, $src$$Register);
3551     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3552   %}
3553   ins_pipe( pipe_slow );
3554 %}
3555 
3556 instruct Repl4S_mem(vecD dst, memory mem) %{
3557   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3558   match(Set dst (ReplicateS (LoadS mem)));
3559   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3560   ins_encode %{
3561     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3562   %}
3563   ins_pipe( pipe_slow );
3564 %}
3565 
3566 instruct Repl8S(vecX dst, rRegI src) %{
3567   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3568   match(Set dst (ReplicateS src));
3569   format %{ "movd    $dst,$src\n\t"
3570             "pshuflw $dst,$dst,0x00\n\t"
3571             "punpcklqdq $dst,$dst\t! replicate8S" %}
3572   ins_encode %{
3573     __ movdl($dst$$XMMRegister, $src$$Register);
3574     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3575     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3576   %}
3577   ins_pipe( pipe_slow );
3578 %}
3579 
3580 instruct Repl8S_mem(vecX dst, memory mem) %{
3581   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3582   match(Set dst (ReplicateS (LoadS mem)));
3583   format %{ "pshuflw $dst,$mem,0x00\n\t"
3584             "punpcklqdq $dst,$dst\t! replicate8S" %}
3585   ins_encode %{
3586     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3587     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3588   %}
3589   ins_pipe( pipe_slow );
3590 %}
3591 
3592 instruct Repl8S_imm(vecX dst, immI con) %{
3593   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3594   match(Set dst (ReplicateS con));
3595   format %{ "movq    $dst,[$constantaddress]\n\t"
3596             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3597   ins_encode %{
3598     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3599     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3600   %}
3601   ins_pipe( pipe_slow );
3602 %}
3603 
3604 instruct Repl16S(vecY dst, rRegI src) %{
3605   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3606   match(Set dst (ReplicateS src));
3607   format %{ "movd    $dst,$src\n\t"
3608             "pshuflw $dst,$dst,0x00\n\t"
3609             "punpcklqdq $dst,$dst\n\t"
3610             "vinserti128_high $dst,$dst\t! replicate16S" %}
3611   ins_encode %{
3612     __ movdl($dst$$XMMRegister, $src$$Register);
3613     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3614     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3615     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3616   %}
3617   ins_pipe( pipe_slow );
3618 %}
3619 
3620 instruct Repl16S_mem(vecY dst, memory mem) %{
3621   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3622   match(Set dst (ReplicateS (LoadS mem)));
3623   format %{ "pshuflw $dst,$mem,0x00\n\t"
3624             "punpcklqdq $dst,$dst\n\t"
3625             "vinserti128_high $dst,$dst\t! replicate16S" %}
3626   ins_encode %{
3627     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3628     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3629     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3630   %}
3631   ins_pipe( pipe_slow );
3632 %}
3633 
3634 instruct Repl16S_imm(vecY dst, immI con) %{
3635   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3636   match(Set dst (ReplicateS con));
3637   format %{ "movq    $dst,[$constantaddress]\n\t"
3638             "punpcklqdq $dst,$dst\n\t"
3639             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3640   ins_encode %{
3641     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3642     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3643     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3644   %}
3645   ins_pipe( pipe_slow );
3646 %}
3647 
3648 instruct Repl32S(legVecZ dst, rRegI src) %{
3649   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3650   match(Set dst (ReplicateS src));
3651   format %{ "movd    $dst,$src\n\t"
3652             "pshuflw $dst,$dst,0x00\n\t"
3653             "punpcklqdq $dst,$dst\n\t"
3654             "vinserti128_high $dst,$dst\n\t"
3655             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3656   ins_encode %{
3657     __ movdl($dst$$XMMRegister, $src$$Register);
3658     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3659     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3660     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3661     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3662   %}
3663   ins_pipe( pipe_slow );
3664 %}
3665 
3666 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3667   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3668   match(Set dst (ReplicateS (LoadS mem)));
3669   format %{ "pshuflw $dst,$mem,0x00\n\t"
3670             "punpcklqdq $dst,$dst\n\t"
3671             "vinserti128_high $dst,$dst\n\t"
3672             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3673   ins_encode %{
3674     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3675     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3676     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3677     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3678   %}
3679   ins_pipe( pipe_slow );
3680 %}
3681 
3682 instruct Repl32S_imm(legVecZ dst, immI con) %{
3683   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3684   match(Set dst (ReplicateS con));
3685   format %{ "movq    $dst,[$constantaddress]\n\t"
3686             "punpcklqdq $dst,$dst\n\t"
3687             "vinserti128_high $dst,$dst\n\t"
3688             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3689   ins_encode %{
3690     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3691     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3692     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3693     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3694   %}
3695   ins_pipe( pipe_slow );
3696 %}
3697 
3698 instruct Repl4I(vecX dst, rRegI src) %{
3699   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3700   match(Set dst (ReplicateI src));
3701   format %{ "movd    $dst,$src\n\t"
3702             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3703   ins_encode %{
3704     __ movdl($dst$$XMMRegister, $src$$Register);
3705     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3706   %}
3707   ins_pipe( pipe_slow );
3708 %}
3709 
3710 instruct Repl4I_mem(vecX dst, memory mem) %{
3711   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3712   match(Set dst (ReplicateI (LoadI mem)));
3713   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3714   ins_encode %{
3715     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3716   %}
3717   ins_pipe( pipe_slow );
3718 %}
3719 
3720 instruct Repl8I(vecY dst, rRegI src) %{
3721   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3722   match(Set dst (ReplicateI src));
3723   format %{ "movd    $dst,$src\n\t"
3724             "pshufd  $dst,$dst,0x00\n\t"
3725             "vinserti128_high $dst,$dst\t! replicate8I" %}
3726   ins_encode %{
3727     __ movdl($dst$$XMMRegister, $src$$Register);
3728     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3729     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3730   %}
3731   ins_pipe( pipe_slow );
3732 %}
3733 
3734 instruct Repl8I_mem(vecY dst, memory mem) %{
3735   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3736   match(Set dst (ReplicateI (LoadI mem)));
3737   format %{ "pshufd  $dst,$mem,0x00\n\t"
3738             "vinserti128_high $dst,$dst\t! replicate8I" %}
3739   ins_encode %{
3740     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3741     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3742   %}
3743   ins_pipe( pipe_slow );
3744 %}
3745 
3746 instruct Repl16I(legVecZ dst, rRegI src) %{
3747   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3748   match(Set dst (ReplicateI src));
3749   format %{ "movd    $dst,$src\n\t"
3750             "pshufd  $dst,$dst,0x00\n\t"
3751             "vinserti128_high $dst,$dst\t"
3752             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3753   ins_encode %{
3754     __ movdl($dst$$XMMRegister, $src$$Register);
3755     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3756     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3757     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3758   %}
3759   ins_pipe( pipe_slow );
3760 %}
3761 
3762 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3763   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3764   match(Set dst (ReplicateI (LoadI mem)));
3765   format %{ "pshufd  $dst,$mem,0x00\n\t"
3766             "vinserti128_high $dst,$dst\t"
3767             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3768   ins_encode %{
3769     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3770     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3771     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3772   %}
3773   ins_pipe( pipe_slow );
3774 %}
3775 
3776 instruct Repl4I_imm(vecX dst, immI con) %{
3777   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3778   match(Set dst (ReplicateI con));
3779   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3780             "punpcklqdq $dst,$dst" %}
3781   ins_encode %{
3782     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3783     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3784   %}
3785   ins_pipe( pipe_slow );
3786 %}
3787 
3788 instruct Repl8I_imm(vecY dst, immI con) %{
3789   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3790   match(Set dst (ReplicateI con));
3791   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3792             "punpcklqdq $dst,$dst\n\t"
3793             "vinserti128_high $dst,$dst" %}
3794   ins_encode %{
3795     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3796     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3797     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3798   %}
3799   ins_pipe( pipe_slow );
3800 %}
3801 
3802 instruct Repl16I_imm(legVecZ dst, immI con) %{
3803   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3804   match(Set dst (ReplicateI con));
3805   format %{ "movq    $dst,[$constantaddress]\t"
3806             "punpcklqdq $dst,$dst\n\t"
3807             "vinserti128_high $dst,$dst"
3808             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3809   ins_encode %{
3810     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3811     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3812     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3813     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3814   %}
3815   ins_pipe( pipe_slow );
3816 %}
3817 
// A long can be loaded into an XMM register directly from memory.
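// For illustration only (not generated output), the memory form avoids a
// GPR->XMM transfer; the expansion below is simply:
//   movq       xmm, [mem]         ; load the 64-bit value straight from memory
//   punpcklqdq xmm, xmm           ; duplicate it into the high half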
3819 instruct Repl2L_mem(vecX dst, memory mem) %{
3820   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3821   match(Set dst (ReplicateL (LoadL mem)));
3822   format %{ "movq    $dst,$mem\n\t"
3823             "punpcklqdq $dst,$dst\t! replicate2L" %}
3824   ins_encode %{
3825     __ movq($dst$$XMMRegister, $mem$$Address);
3826     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3827   %}
3828   ins_pipe( pipe_slow );
3829 %}
3830 
3831 // Replicate long (8 byte) scalar to be vector
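// On 64-bit (_LP64) a single movdq moves the scalar from its GPR into the XMM
// register.  On 32-bit the long occupies a register pair, so (as sketched
// below, mirroring the !_LP64 instructs) the two halves are merged first:
//   movdl      xmm, src.lo        ; low 32 bits
//   movdl      tmp, src.hi        ; high 32 bits (HIGH_FROM_LOW pairing)
//   punpckldq  xmm, tmp           ; combine into one 64-bit lane
//   punpcklqdq xmm, xmm           ; then broadcast as usual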
3832 #ifdef _LP64
3833 instruct Repl4L(vecY dst, rRegL src) %{
3834   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3835   match(Set dst (ReplicateL src));
3836   format %{ "movdq   $dst,$src\n\t"
3837             "punpcklqdq $dst,$dst\n\t"
3838             "vinserti128_high $dst,$dst\t! replicate4L" %}
3839   ins_encode %{
3840     __ movdq($dst$$XMMRegister, $src$$Register);
3841     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3842     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3843   %}
3844   ins_pipe( pipe_slow );
3845 %}
3846 
3847 instruct Repl8L(legVecZ dst, rRegL src) %{
3848   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3849   match(Set dst (ReplicateL src));
3850   format %{ "movdq   $dst,$src\n\t"
3851             "punpcklqdq $dst,$dst\n\t"
3852             "vinserti128_high $dst,$dst\t"
3853             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3854   ins_encode %{
3855     __ movdq($dst$$XMMRegister, $src$$Register);
3856     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3857     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3858     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3859   %}
3860   ins_pipe( pipe_slow );
3861 %}
3862 #else // _LP64
3863 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3864   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3865   match(Set dst (ReplicateL src));
3866   effect(TEMP dst, USE src, TEMP tmp);
3867   format %{ "movdl   $dst,$src.lo\n\t"
3868             "movdl   $tmp,$src.hi\n\t"
3869             "punpckldq $dst,$tmp\n\t"
3870             "punpcklqdq $dst,$dst\n\t"
3871             "vinserti128_high $dst,$dst\t! replicate4L" %}
3872   ins_encode %{
3873     __ movdl($dst$$XMMRegister, $src$$Register);
3874     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3875     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3876     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3877     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3878   %}
3879   ins_pipe( pipe_slow );
3880 %}
3881 
3882 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3883   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3884   match(Set dst (ReplicateL src));
3885   effect(TEMP dst, USE src, TEMP tmp);
3886   format %{ "movdl   $dst,$src.lo\n\t"
3887             "movdl   $tmp,$src.hi\n\t"
3888             "punpckldq $dst,$tmp\n\t"
3889             "punpcklqdq $dst,$dst\n\t"
3890             "vinserti128_high $dst,$dst\t"
3891             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3892   ins_encode %{
3893     __ movdl($dst$$XMMRegister, $src$$Register);
3894     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3895     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3896     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3897     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3898     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3899   %}
3900   ins_pipe( pipe_slow );
3901 %}
3902 #endif // _LP64
3903 
3904 instruct Repl4L_imm(vecY dst, immL con) %{
3905   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3906   match(Set dst (ReplicateL con));
3907   format %{ "movq    $dst,[$constantaddress]\n\t"
3908             "punpcklqdq $dst,$dst\n\t"
3909             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3910   ins_encode %{
3911     __ movq($dst$$XMMRegister, $constantaddress($con));
3912     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3913     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3914   %}
3915   ins_pipe( pipe_slow );
3916 %}
3917 
3918 instruct Repl8L_imm(legVecZ dst, immL con) %{
3919   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3920   match(Set dst (ReplicateL con));
3921   format %{ "movq    $dst,[$constantaddress]\n\t"
3922             "punpcklqdq $dst,$dst\n\t"
3923             "vinserti128_high $dst,$dst\t"
3924             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3925   ins_encode %{
3926     __ movq($dst$$XMMRegister, $constantaddress($con));
3927     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3928     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3929     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3930   %}
3931   ins_pipe( pipe_slow );
3932 %}
3933 
3934 instruct Repl4L_mem(vecY dst, memory mem) %{
3935   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3936   match(Set dst (ReplicateL (LoadL mem)));
3937   format %{ "movq    $dst,$mem\n\t"
3938             "punpcklqdq $dst,$dst\n\t"
3939             "vinserti128_high $dst,$dst\t! replicate4L" %}
3940   ins_encode %{
3941     __ movq($dst$$XMMRegister, $mem$$Address);
3942     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3943     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3944   %}
3945   ins_pipe( pipe_slow );
3946 %}
3947 
3948 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3949   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3950   match(Set dst (ReplicateL (LoadL mem)));
3951   format %{ "movq    $dst,$mem\n\t"
3952             "punpcklqdq $dst,$dst\n\t"
3953             "vinserti128_high $dst,$dst\t"
3954             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3955   ins_encode %{
3956     __ movq($dst$$XMMRegister, $mem$$Address);
3957     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3958     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3959     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3960   %}
3961   ins_pipe( pipe_slow );
3962 %}
3963 
3964 instruct Repl2F_mem(vecD dst, memory mem) %{
3965   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3966   match(Set dst (ReplicateF (LoadF mem)));
3967   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3968   ins_encode %{
3969     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3970   %}
3971   ins_pipe( pipe_slow );
3972 %}
3973 
3974 instruct Repl4F_mem(vecX dst, memory mem) %{
3975   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3976   match(Set dst (ReplicateF (LoadF mem)));
3977   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3978   ins_encode %{
3979     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3980   %}
3981   ins_pipe( pipe_slow );
3982 %}
3983 
3984 instruct Repl8F(vecY dst, vlRegF src) %{
3985   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3986   match(Set dst (ReplicateF src));
3987   format %{ "pshufd  $dst,$src,0x00\n\t"
3988             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3989   ins_encode %{
3990     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3991     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3992   %}
3993   ins_pipe( pipe_slow );
3994 %}
3995 
3996 instruct Repl8F_mem(vecY dst, memory mem) %{
3997   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3998   match(Set dst (ReplicateF (LoadF mem)));
3999   format %{ "pshufd  $dst,$mem,0x00\n\t"
4000             "vinsertf128_high $dst,$dst\t! replicate8F" %}
4001   ins_encode %{
4002     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4003     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4004   %}
4005   ins_pipe( pipe_slow );
4006 %}
4007 
4008 instruct Repl16F(legVecZ dst, vlRegF src) %{
4009   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4010   match(Set dst (ReplicateF src));
4011   format %{ "pshufd  $dst,$src,0x00\n\t"
4012             "vinsertf128_high $dst,$dst\t"
4013             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
4014   ins_encode %{
4015     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4016     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4017     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4018   %}
4019   ins_pipe( pipe_slow );
4020 %}
4021 
4022 instruct Repl16F_mem(legVecZ dst, memory mem) %{
4023   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4024   match(Set dst (ReplicateF (LoadF mem)));
4025   format %{ "pshufd  $dst,$mem,0x00\n\t"
4026             "vinsertf128_high $dst,$dst\t"
4027             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
4028   ins_encode %{
4029     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4030     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4031     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4032   %}
4033   ins_pipe( pipe_slow );
4034 %}
4035 
4036 instruct Repl2F_zero(vecD dst, immF0 zero) %{
4037   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
4038   match(Set dst (ReplicateF zero));
4039   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
4040   ins_encode %{
4041     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4042   %}
4043   ins_pipe( fpu_reg_reg );
4044 %}
4045 
4046 instruct Repl4F_zero(vecX dst, immF0 zero) %{
4047   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
4048   match(Set dst (ReplicateF zero));
4049   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
4050   ins_encode %{
4051     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4052   %}
4053   ins_pipe( fpu_reg_reg );
4054 %}
4055 
4056 instruct Repl8F_zero(vecY dst, immF0 zero) %{
4057   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
4058   match(Set dst (ReplicateF zero));
4059   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
4060   ins_encode %{
4061     int vector_len = 1;
4062     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4063   %}
4064   ins_pipe( fpu_reg_reg );
4065 %}
4066 
4067 instruct Repl2D_mem(vecX dst, memory mem) %{
4068   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4069   match(Set dst (ReplicateD (LoadD mem)));
4070   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
4071   ins_encode %{
4072     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4073   %}
4074   ins_pipe( pipe_slow );
4075 %}
4076 
4077 instruct Repl4D(vecY dst, vlRegD src) %{
4078   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4079   match(Set dst (ReplicateD src));
4080   format %{ "pshufd  $dst,$src,0x44\n\t"
4081             "vinsertf128_high $dst,$dst\t! replicate4D" %}
4082   ins_encode %{
4083     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4084     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4085   %}
4086   ins_pipe( pipe_slow );
4087 %}
4088 
4089 instruct Repl4D_mem(vecY dst, memory mem) %{
4090   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4091   match(Set dst (ReplicateD (LoadD mem)));
4092   format %{ "pshufd  $dst,$mem,0x44\n\t"
4093             "vinsertf128_high $dst,$dst\t! replicate4D" %}
4094   ins_encode %{
4095     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4096     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4097   %}
4098   ins_pipe( pipe_slow );
4099 %}
4100 
4101 instruct Repl8D(legVecZ dst, vlRegD src) %{
4102   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4103   match(Set dst (ReplicateD src));
4104   format %{ "pshufd  $dst,$src,0x44\n\t"
4105             "vinsertf128_high $dst,$dst\t"
4106             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
4107   ins_encode %{
4108     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4109     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4110     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4111   %}
4112   ins_pipe( pipe_slow );
4113 %}
4114 
4115 instruct Repl8D_mem(legVecZ dst, memory mem) %{
4116   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4117   match(Set dst (ReplicateD (LoadD mem)));
4118   format %{ "pshufd  $dst,$mem,0x44\n\t"
4119             "vinsertf128_high $dst,$dst\t"
4120             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
4121   ins_encode %{
4122     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4123     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4124     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4125   %}
4126   ins_pipe( pipe_slow );
4127 %}
4128 
4129 // Replicate double (8 byte) scalar zero to be vector
4130 instruct Repl2D_zero(vecX dst, immD0 zero) %{
4131   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
4132   match(Set dst (ReplicateD zero));
4133   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
4134   ins_encode %{
4135     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4136   %}
4137   ins_pipe( fpu_reg_reg );
4138 %}
4139 
4140 instruct Repl4D_zero(vecY dst, immD0 zero) %{
4141   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
4142   match(Set dst (ReplicateD zero));
4143   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
4144   ins_encode %{
4145     int vector_len = 1;
4146     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4147   %}
4148   ins_pipe( fpu_reg_reg );
4149 %}
4150 
4151 // ====================GENERIC REPLICATE==========================================
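// These generic forms are the fallback when no AVX-512 broadcast applies.
// As a hedged sketch (illustrative register names, not generated output),
// SuperWord turns a loop-invariant int stored through a vector into a
// ReplicateI node, which expands to the classic SSE broadcast idiom:
//   movd   xmm, eax               ; scalar -> element 0
//   pshufd xmm, xmm, 0x00         ; copy element 0 to all lanes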
4152 
4153 // Replicate byte scalar to be vector
4154 instruct Repl4B(vecS dst, rRegI src) %{
4155   predicate(n->as_Vector()->length() == 4);
4156   match(Set dst (ReplicateB src));
4157   format %{ "movd    $dst,$src\n\t"
4158             "punpcklbw $dst,$dst\n\t"
4159             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
4160   ins_encode %{
4161     __ movdl($dst$$XMMRegister, $src$$Register);
4162     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4163     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4164   %}
4165   ins_pipe( pipe_slow );
4166 %}
4167 
4168 instruct Repl8B(vecD dst, rRegI src) %{
4169   predicate(n->as_Vector()->length() == 8);
4170   match(Set dst (ReplicateB src));
4171   format %{ "movd    $dst,$src\n\t"
4172             "punpcklbw $dst,$dst\n\t"
4173             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
4174   ins_encode %{
4175     __ movdl($dst$$XMMRegister, $src$$Register);
4176     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4177     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4178   %}
4179   ins_pipe( pipe_slow );
4180 %}
4181 
4182 // Replicate byte scalar immediate to be vector by loading from const table.
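// As an example of the encoding (assuming replicate4_imm/replicate8_imm simply
// repeat the low 'width' bytes of the immediate), ReplicateB of 0x41 places
// 0x41414141 (movdl) or 0x4141414141414141 (movq) in the constant table.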
4183 instruct Repl4B_imm(vecS dst, immI con) %{
4184   predicate(n->as_Vector()->length() == 4);
4185   match(Set dst (ReplicateB con));
4186   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
4187   ins_encode %{
4188     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
4189   %}
4190   ins_pipe( pipe_slow );
4191 %}
4192 
4193 instruct Repl8B_imm(vecD dst, immI con) %{
4194   predicate(n->as_Vector()->length() == 8);
4195   match(Set dst (ReplicateB con));
4196   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
4197   ins_encode %{
4198     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4199   %}
4200   ins_pipe( pipe_slow );
4201 %}
4202 
4203 // Replicate byte scalar zero to be vector
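// pxor reg,reg is the usual vector zeroing idiom; on most modern x86 cores it
// is recognized as dependency-breaking, so no prior value of the register is read.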
4204 instruct Repl4B_zero(vecS dst, immI0 zero) %{
4205   predicate(n->as_Vector()->length() == 4);
4206   match(Set dst (ReplicateB zero));
4207   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
4208   ins_encode %{
4209     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4210   %}
4211   ins_pipe( fpu_reg_reg );
4212 %}
4213 
4214 instruct Repl8B_zero(vecD dst, immI0 zero) %{
4215   predicate(n->as_Vector()->length() == 8);
4216   match(Set dst (ReplicateB zero));
4217   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
4218   ins_encode %{
4219     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4220   %}
4221   ins_pipe( fpu_reg_reg );
4222 %}
4223 
4224 instruct Repl16B_zero(vecX dst, immI0 zero) %{
4225   predicate(n->as_Vector()->length() == 16);
4226   match(Set dst (ReplicateB zero));
4227   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
4228   ins_encode %{
4229     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4230   %}
4231   ins_pipe( fpu_reg_reg );
4232 %}
4233 
4234 instruct Repl32B_zero(vecY dst, immI0 zero) %{
4235   predicate(n->as_Vector()->length() == 32);
4236   match(Set dst (ReplicateB zero));
4237   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
4238   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available (AVX1 lacks 256-bit vpxor; AVX2 adds it).
4240     int vector_len = 1;
4241     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4242   %}
4243   ins_pipe( fpu_reg_reg );
4244 %}
4245 
4246 // Replicate char/short (2 byte) scalar to be vector
4247 instruct Repl2S(vecS dst, rRegI src) %{
4248   predicate(n->as_Vector()->length() == 2);
4249   match(Set dst (ReplicateS src));
4250   format %{ "movd    $dst,$src\n\t"
4251             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
4252   ins_encode %{
4253     __ movdl($dst$$XMMRegister, $src$$Register);
4254     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4255   %}
4256   ins_pipe( fpu_reg_reg );
4257 %}
4258 
4259 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
4260 instruct Repl2S_imm(vecS dst, immI con) %{
4261   predicate(n->as_Vector()->length() == 2);
4262   match(Set dst (ReplicateS con));
4263   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
4264   ins_encode %{
4265     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4266   %}
4267   ins_pipe( fpu_reg_reg );
4268 %}
4269 
4270 instruct Repl4S_imm(vecD dst, immI con) %{
4271   predicate(n->as_Vector()->length() == 4);
4272   match(Set dst (ReplicateS con));
4273   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4274   ins_encode %{
4275     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4276   %}
4277   ins_pipe( fpu_reg_reg );
4278 %}
4279 
4280 // Replicate char/short (2 byte) scalar zero to be vector
4281 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4282   predicate(n->as_Vector()->length() == 2);
4283   match(Set dst (ReplicateS zero));
4284   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4285   ins_encode %{
4286     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4287   %}
4288   ins_pipe( fpu_reg_reg );
4289 %}
4290 
4291 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4292   predicate(n->as_Vector()->length() == 4);
4293   match(Set dst (ReplicateS zero));
4294   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4295   ins_encode %{
4296     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4297   %}
4298   ins_pipe( fpu_reg_reg );
4299 %}
4300 
4301 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4302   predicate(n->as_Vector()->length() == 8);
4303   match(Set dst (ReplicateS zero));
4304   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4305   ins_encode %{
4306     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4307   %}
4308   ins_pipe( fpu_reg_reg );
4309 %}
4310 
4311 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4312   predicate(n->as_Vector()->length() == 16);
4313   match(Set dst (ReplicateS zero));
4314   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4315   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available (AVX1 lacks 256-bit vpxor; AVX2 adds it).
4317     int vector_len = 1;
4318     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4319   %}
4320   ins_pipe( fpu_reg_reg );
4321 %}
4322 
4323 // Replicate integer (4 byte) scalar to be vector
4324 instruct Repl2I(vecD dst, rRegI src) %{
4325   predicate(n->as_Vector()->length() == 2);
4326   match(Set dst (ReplicateI src));
4327   format %{ "movd    $dst,$src\n\t"
4328             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4329   ins_encode %{
4330     __ movdl($dst$$XMMRegister, $src$$Register);
4331     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4332   %}
4333   ins_pipe( fpu_reg_reg );
4334 %}
4335 
// An integer can be loaded into an XMM register directly from memory.
4337 instruct Repl2I_mem(vecD dst, memory mem) %{
4338   predicate(n->as_Vector()->length() == 2);
4339   match(Set dst (ReplicateI (LoadI mem)));
4340   format %{ "movd    $dst,$mem\n\t"
4341             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4342   ins_encode %{
4343     __ movdl($dst$$XMMRegister, $mem$$Address);
4344     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4345   %}
4346   ins_pipe( fpu_reg_reg );
4347 %}
4348 
4349 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4350 instruct Repl2I_imm(vecD dst, immI con) %{
4351   predicate(n->as_Vector()->length() == 2);
4352   match(Set dst (ReplicateI con));
4353   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4354   ins_encode %{
4355     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4356   %}
4357   ins_pipe( fpu_reg_reg );
4358 %}
4359 
4360 // Replicate integer (4 byte) scalar zero to be vector
4361 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4362   predicate(n->as_Vector()->length() == 2);
4363   match(Set dst (ReplicateI zero));
4364   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4365   ins_encode %{
4366     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4367   %}
4368   ins_pipe( fpu_reg_reg );
4369 %}
4370 
4371 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4372   predicate(n->as_Vector()->length() == 4);
4373   match(Set dst (ReplicateI zero));
4374   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4375   ins_encode %{
4376     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4377   %}
4378   ins_pipe( fpu_reg_reg );
4379 %}
4380 
4381 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4382   predicate(n->as_Vector()->length() == 8);
4383   match(Set dst (ReplicateI zero));
4384   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4385   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available (AVX1 lacks 256-bit vpxor; AVX2 adds it).
4387     int vector_len = 1;
4388     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4389   %}
4390   ins_pipe( fpu_reg_reg );
4391 %}
4392 
4393 // Replicate long (8 byte) scalar to be vector
4394 #ifdef _LP64
4395 instruct Repl2L(vecX dst, rRegL src) %{
4396   predicate(n->as_Vector()->length() == 2);
4397   match(Set dst (ReplicateL src));
4398   format %{ "movdq   $dst,$src\n\t"
4399             "punpcklqdq $dst,$dst\t! replicate2L" %}
4400   ins_encode %{
4401     __ movdq($dst$$XMMRegister, $src$$Register);
4402     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4403   %}
4404   ins_pipe( pipe_slow );
4405 %}
4406 #else // _LP64
4407 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4408   predicate(n->as_Vector()->length() == 2);
4409   match(Set dst (ReplicateL src));
4410   effect(TEMP dst, USE src, TEMP tmp);
4411   format %{ "movdl   $dst,$src.lo\n\t"
4412             "movdl   $tmp,$src.hi\n\t"
4413             "punpckldq $dst,$tmp\n\t"
4414             "punpcklqdq $dst,$dst\t! replicate2L"%}
4415   ins_encode %{
4416     __ movdl($dst$$XMMRegister, $src$$Register);
4417     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4418     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4419     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4420   %}
4421   ins_pipe( pipe_slow );
4422 %}
4423 #endif // _LP64
4424 
4425 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4426 instruct Repl2L_imm(vecX dst, immL con) %{
4427   predicate(n->as_Vector()->length() == 2);
4428   match(Set dst (ReplicateL con));
4429   format %{ "movq    $dst,[$constantaddress]\n\t"
4430             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4431   ins_encode %{
4432     __ movq($dst$$XMMRegister, $constantaddress($con));
4433     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4434   %}
4435   ins_pipe( pipe_slow );
4436 %}
4437 
4438 // Replicate long (8 byte) scalar zero to be vector
4439 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4440   predicate(n->as_Vector()->length() == 2);
4441   match(Set dst (ReplicateL zero));
4442   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4443   ins_encode %{
4444     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4445   %}
4446   ins_pipe( fpu_reg_reg );
4447 %}
4448 
4449 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4450   predicate(n->as_Vector()->length() == 4);
4451   match(Set dst (ReplicateL zero));
4452   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4453   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when only AVX1 is available (AVX1 lacks 256-bit vpxor; AVX2 adds it).
4455     int vector_len = 1;
4456     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4457   %}
4458   ins_pipe( fpu_reg_reg );
4459 %}
4460 
4461 // Replicate float (4 byte) scalar to be vector
4462 instruct Repl2F(vecD dst, vlRegF src) %{
4463   predicate(n->as_Vector()->length() == 2);
4464   match(Set dst (ReplicateF src));
4465   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4466   ins_encode %{
4467     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4468   %}
4469   ins_pipe( fpu_reg_reg );
4470 %}
4471 
4472 instruct Repl4F(vecX dst, vlRegF src) %{
4473   predicate(n->as_Vector()->length() == 4);
4474   match(Set dst (ReplicateF src));
4475   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4476   ins_encode %{
4477     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4478   %}
4479   ins_pipe( pipe_slow );
4480 %}
4481 
4482 // Replicate double (8 bytes) scalar to be vector
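// The pshufd immediate is four 2-bit dword selectors: 0x00 picks element 0 for
// every lane (the float broadcast above), while 0x44 = 0b01_00_01_00 picks
// dwords {0,1,0,1}, i.e. it duplicates the low 64-bit double into both lanes.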
4483 instruct Repl2D(vecX dst, vlRegD src) %{
4484   predicate(n->as_Vector()->length() == 2);
4485   match(Set dst (ReplicateD src));
4486   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4487   ins_encode %{
4488     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4489   %}
4490   ins_pipe( pipe_slow );
4491 %}
4492 
4493 // ====================EVEX REPLICATE=============================================
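// With AVX-512 (plus AVX512VL for the sub-512-bit forms, and AVX512BW for the
// byte/short forms) the movd/pshufd/vinsert cascades above collapse into a
// single broadcast; as a sketch:
//   evpbroadcastd xmm, eax        ; broadcast from a GPR
//   vpbroadcastd  ymm, [mem]      ; broadcast from memory, 256-bit form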
4494 
4495 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4496   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4497   match(Set dst (ReplicateB (LoadB mem)));
4498   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4499   ins_encode %{
4500     int vector_len = 0;
4501     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4502   %}
4503   ins_pipe( pipe_slow );
4504 %}
4505 
4506 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4507   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4508   match(Set dst (ReplicateB (LoadB mem)));
4509   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4510   ins_encode %{
4511     int vector_len = 0;
4512     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4513   %}
4514   ins_pipe( pipe_slow );
4515 %}
4516 
4517 instruct Repl16B_evex(vecX dst, rRegI src) %{
4518   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4519   match(Set dst (ReplicateB src));
4520   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4521   ins_encode %{
    int vector_len = 0;
4523     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4524   %}
4525   ins_pipe( pipe_slow );
4526 %}
4527 
4528 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4529   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4530   match(Set dst (ReplicateB (LoadB mem)));
4531   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4532   ins_encode %{
4533     int vector_len = 0;
4534     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4535   %}
4536   ins_pipe( pipe_slow );
4537 %}
4538 
4539 instruct Repl32B_evex(vecY dst, rRegI src) %{
4540   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4541   match(Set dst (ReplicateB src));
4542   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4543   ins_encode %{
    int vector_len = 1;
4545     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4546   %}
4547   ins_pipe( pipe_slow );
4548 %}
4549 
4550 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4551   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4552   match(Set dst (ReplicateB (LoadB mem)));
4553   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4554   ins_encode %{
4555     int vector_len = 1;
4556     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4557   %}
4558   ins_pipe( pipe_slow );
4559 %}
4560 
4561 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4562   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4563   match(Set dst (ReplicateB src));
4564   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4565   ins_encode %{
    int vector_len = 2;
4567     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4568   %}
4569   ins_pipe( pipe_slow );
4570 %}
4571 
4572 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4573   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4574   match(Set dst (ReplicateB (LoadB mem)));
4575   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4576   ins_encode %{
4577     int vector_len = 2;
4578     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4579   %}
4580   ins_pipe( pipe_slow );
4581 %}
4582 
4583 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4584   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4585   match(Set dst (ReplicateB con));
4586   format %{ "movq    $dst,[$constantaddress]\n\t"
4587             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4588   ins_encode %{
    int vector_len = 0;
4590     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4591     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4592   %}
4593   ins_pipe( pipe_slow );
4594 %}
4595 
4596 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4597   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4598   match(Set dst (ReplicateB con));
4599   format %{ "movq    $dst,[$constantaddress]\n\t"
4600             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4601   ins_encode %{
    int vector_len = 1;
4603     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4604     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4605   %}
4606   ins_pipe( pipe_slow );
4607 %}
4608 
4609 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4610   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4611   match(Set dst (ReplicateB con));
4612   format %{ "movq    $dst,[$constantaddress]\n\t"
4613             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4614   ins_encode %{
    int vector_len = 2;
4616     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4617     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4618   %}
4619   ins_pipe( pipe_slow );
4620 %}
4621 
4622 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4623   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4624   match(Set dst (ReplicateB zero));
4625   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4626   ins_encode %{
    // With UseAVX > 2 the 512-bit vpxor is available (EVEX-encoded), so it is used directly.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4630   %}
4631   ins_pipe( fpu_reg_reg );
4632 %}
4633 
4634 instruct Repl4S_evex(vecD dst, rRegI src) %{
4635   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4636   match(Set dst (ReplicateS src));
4637   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4638   ins_encode %{
    int vector_len = 0;
4640     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4641   %}
4642   ins_pipe( pipe_slow );
4643 %}
4644 
4645 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4646   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4647   match(Set dst (ReplicateS (LoadS mem)));
4648   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4649   ins_encode %{
4650     int vector_len = 0;
4651     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4652   %}
4653   ins_pipe( pipe_slow );
4654 %}
4655 
4656 instruct Repl8S_evex(vecX dst, rRegI src) %{
4657   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4658   match(Set dst (ReplicateS src));
4659   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4660   ins_encode %{
    int vector_len = 0;
4662     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4663   %}
4664   ins_pipe( pipe_slow );
4665 %}
4666 
4667 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4668   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4669   match(Set dst (ReplicateS (LoadS mem)));
4670   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4671   ins_encode %{
4672     int vector_len = 0;
4673     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4674   %}
4675   ins_pipe( pipe_slow );
4676 %}
4677 
4678 instruct Repl16S_evex(vecY dst, rRegI src) %{
4679   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4680   match(Set dst (ReplicateS src));
4681   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4682   ins_encode %{
    int vector_len = 1;
4684     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4685   %}
4686   ins_pipe( pipe_slow );
4687 %}
4688 
4689 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4690   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4691   match(Set dst (ReplicateS (LoadS mem)));
4692   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4693   ins_encode %{
4694     int vector_len = 1;
4695     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4696   %}
4697   ins_pipe( pipe_slow );
4698 %}
4699 
4700 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4701   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4702   match(Set dst (ReplicateS src));
4703   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4704   ins_encode %{
    int vector_len = 2;
4706     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4712   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4713   match(Set dst (ReplicateS (LoadS mem)));
4714   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4715   ins_encode %{
4716     int vector_len = 2;
4717     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4718   %}
4719   ins_pipe( pipe_slow );
4720 %}
4721 
4722 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4723   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4724   match(Set dst (ReplicateS con));
4725   format %{ "movq    $dst,[$constantaddress]\n\t"
4726             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4727   ins_encode %{
    int vector_len = 0;
4729     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4730     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4731   %}
4732   ins_pipe( pipe_slow );
4733 %}
4734 
4735 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4736   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4737   match(Set dst (ReplicateS con));
4738   format %{ "movq    $dst,[$constantaddress]\n\t"
4739             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4740   ins_encode %{
    int vector_len = 1;
4742     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4743     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4744   %}
4745   ins_pipe( pipe_slow );
4746 %}
4747 
4748 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4749   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4750   match(Set dst (ReplicateS con));
4751   format %{ "movq    $dst,[$constantaddress]\n\t"
4752             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4753   ins_encode %{
    int vector_len = 2;
4755     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4756     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4757   %}
4758   ins_pipe( pipe_slow );
4759 %}
4760 
4761 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4762   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4763   match(Set dst (ReplicateS zero));
4764   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4765   ins_encode %{
4766     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4767     int vector_len = 2;
4768     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4769   %}
4770   ins_pipe( fpu_reg_reg );
4771 %}
4772 
4773 instruct Repl4I_evex(vecX dst, rRegI src) %{
4774   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4775   match(Set dst (ReplicateI src));
4776   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4777   ins_encode %{
4778     int vector_len = 0;
4779     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4780   %}
4781   ins_pipe( pipe_slow );
4782 %}
4783 
4784 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4785   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4786   match(Set dst (ReplicateI (LoadI mem)));
4787   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4788   ins_encode %{
4789     int vector_len = 0;
4790     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4791   %}
4792   ins_pipe( pipe_slow );
4793 %}
4794 
4795 instruct Repl8I_evex(vecY dst, rRegI src) %{
4796   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4797   match(Set dst (ReplicateI src));
4798   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4799   ins_encode %{
4800     int vector_len = 1;
4801     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4802   %}
4803   ins_pipe( pipe_slow );
4804 %}
4805 
4806 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4807   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4808   match(Set dst (ReplicateI (LoadI mem)));
4809   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4810   ins_encode %{
4811     int vector_len = 1;
4812     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4813   %}
4814   ins_pipe( pipe_slow );
4815 %}
4816 
4817 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4818   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4819   match(Set dst (ReplicateI src));
4820   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4821   ins_encode %{
4822     int vector_len = 2;
4823     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4824   %}
4825   ins_pipe( pipe_slow );
4826 %}
4827 
4828 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4829   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4830   match(Set dst (ReplicateI (LoadI mem)));
4831   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4832   ins_encode %{
4833     int vector_len = 2;
4834     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4835   %}
4836   ins_pipe( pipe_slow );
4837 %}
4838 
4839 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4840   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4841   match(Set dst (ReplicateI con));
4842   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4843             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4844   ins_encode %{
4845     int vector_len = 0;
4846     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4847     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4848   %}
4849   ins_pipe( pipe_slow );
4850 %}
4851 
4852 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4853   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4854   match(Set dst (ReplicateI con));
4855   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4856             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4857   ins_encode %{
4858     int vector_len = 1;
4859     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4860     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4861   %}
4862   ins_pipe( pipe_slow );
4863 %}
4864 
4865 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4866   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4867   match(Set dst (ReplicateI con));
4868   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4869             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4870   ins_encode %{
4871     int vector_len = 2;
4872     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4873     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4874   %}
4875   ins_pipe( pipe_slow );
4876 %}
4877 
4878 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4879   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4880   match(Set dst (ReplicateI zero));
4881   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4882   ins_encode %{
    // With UseAVX > 2 the 512-bit vpxor is available (EVEX-encoded), so it is used directly.
4884     int vector_len = 2;
4885     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4886   %}
4887   ins_pipe( fpu_reg_reg );
4888 %}
4889 
4890 // Replicate long (8 byte) scalar to be vector
4891 #ifdef _LP64
4892 instruct Repl4L_evex(vecY dst, rRegL src) %{
4893   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4894   match(Set dst (ReplicateL src));
4895   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4896   ins_encode %{
4897     int vector_len = 1;
4898     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4899   %}
4900   ins_pipe( pipe_slow );
4901 %}
4902 
4903 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4904   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4905   match(Set dst (ReplicateL src));
4906   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4907   ins_encode %{
4908     int vector_len = 2;
4909     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4910   %}
4911   ins_pipe( pipe_slow );
4912 %}
4913 #else // _LP64
4914 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4915   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4916   match(Set dst (ReplicateL src));
4917   effect(TEMP dst, USE src, TEMP tmp);
4918   format %{ "movdl   $dst,$src.lo\n\t"
4919             "movdl   $tmp,$src.hi\n\t"
4920             "punpckldq $dst,$tmp\n\t"
4921             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4922   ins_encode %{
4923     int vector_len = 1;
4924     __ movdl($dst$$XMMRegister, $src$$Register);
4925     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4926     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4927     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4928   %}
4929   ins_pipe( pipe_slow );
4930 %}
4931 
4932 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4933   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4934   match(Set dst (ReplicateL src));
4935   effect(TEMP dst, USE src, TEMP tmp);
4936   format %{ "movdl   $dst,$src.lo\n\t"
4937             "movdl   $tmp,$src.hi\n\t"
4938             "punpckldq $dst,$tmp\n\t"
4939             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4940   ins_encode %{
4941     int vector_len = 2;
4942     __ movdl($dst$$XMMRegister, $src$$Register);
4943     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4944     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4945     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4946   %}
4947   ins_pipe( pipe_slow );
4948 %}
4949 #endif // _LP64
4950 
4951 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4952   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4953   match(Set dst (ReplicateL con));
4954   format %{ "movq    $dst,[$constantaddress]\n\t"
4955             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4956   ins_encode %{
4957     int vector_len = 1;
4958     __ movq($dst$$XMMRegister, $constantaddress($con));
4959     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4965   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4966   match(Set dst (ReplicateL con));
4967   format %{ "movq    $dst,[$constantaddress]\n\t"
4968             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4969   ins_encode %{
4970     int vector_len = 2;
4971     __ movq($dst$$XMMRegister, $constantaddress($con));
4972     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4973   %}
4974   ins_pipe( pipe_slow );
4975 %}
4976 
4977 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4978   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4979   match(Set dst (ReplicateL (LoadL mem)));
4980   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4981   ins_encode %{
4982     int vector_len = 0;
4983     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4984   %}
4985   ins_pipe( pipe_slow );
4986 %}
4987 
4988 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4989   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4990   match(Set dst (ReplicateL (LoadL mem)));
4991   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4992   ins_encode %{
4993     int vector_len = 1;
4994     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4995   %}
4996   ins_pipe( pipe_slow );
4997 %}
4998 
4999 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
5000   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5001   match(Set dst (ReplicateL (LoadL mem)));
5002   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
5003   ins_encode %{
5004     int vector_len = 2;
5005     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
5006   %}
5007   ins_pipe( pipe_slow );
5008 %}
5009 
5010 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
5011   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5012   match(Set dst (ReplicateL zero));
5013   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
5014   ins_encode %{
    // With UseAVX > 2 the 512-bit vpxor is available (EVEX-encoded), so it is used directly.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5018   %}
5019   ins_pipe( fpu_reg_reg );
5020 %}
5021 
5022 instruct Repl8F_evex(vecY dst, regF src) %{
5023   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5024   match(Set dst (ReplicateF src));
5025   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
5026   ins_encode %{
5027     int vector_len = 1;
5028     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5029   %}
5030   ins_pipe( pipe_slow );
5031 %}
5032 
5033 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
5034   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5035   match(Set dst (ReplicateF (LoadF mem)));
5036   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
5037   ins_encode %{
5038     int vector_len = 1;
5039     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
5040   %}
5041   ins_pipe( pipe_slow );
5042 %}
5043 
5044 instruct Repl16F_evex(vecZ dst, regF src) %{
5045   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5046   match(Set dst (ReplicateF src));
5047   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
5048   ins_encode %{
5049     int vector_len = 2;
5050     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5051   %}
5052   ins_pipe( pipe_slow );
5053 %}
5054 
5055 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
5056   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5057   match(Set dst (ReplicateF (LoadF mem)));
5058   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
5059   ins_encode %{
5060     int vector_len = 2;
5061     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
5062   %}
5063   ins_pipe( pipe_slow );
5064 %}
5065 
5066 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
5067   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
5068   match(Set dst (ReplicateF zero));
5069   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
5070   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5074   %}
5075   ins_pipe( fpu_reg_reg );
5076 %}
5077 
5078 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
5079   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
5080   match(Set dst (ReplicateF zero));
5081   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
5082   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5086   %}
5087   ins_pipe( fpu_reg_reg );
5088 %}
5089 
5090 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
5091   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5092   match(Set dst (ReplicateF zero));
5093   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
5094   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5098   %}
5099   ins_pipe( fpu_reg_reg );
5100 %}
5101 
5102 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
5103   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5104   match(Set dst (ReplicateF zero));
5105   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
5106   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5110   %}
5111   ins_pipe( fpu_reg_reg );
5112 %}
5113 
5114 instruct Repl4D_evex(vecY dst, regD src) %{
5115   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5116   match(Set dst (ReplicateD src));
5117   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
5118   ins_encode %{
5119     int vector_len = 1;
5120     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5121   %}
5122   ins_pipe( pipe_slow );
5123 %}
5124 
5125 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
5126   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5127   match(Set dst (ReplicateD (LoadD mem)));
5128   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
5129   ins_encode %{
5130     int vector_len = 1;
5131     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
5132   %}
5133   ins_pipe( pipe_slow );
5134 %}
5135 
5136 instruct Repl8D_evex(vecZ dst, regD src) %{
5137   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5138   match(Set dst (ReplicateD src));
5139   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
5140   ins_encode %{
5141     int vector_len = 2;
5142     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5143   %}
5144   ins_pipe( pipe_slow );
5145 %}
5146 
5147 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
5148   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5149   match(Set dst (ReplicateD (LoadD mem)));
5150   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
5151   ins_encode %{
5152     int vector_len = 2;
5153     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
5154   %}
5155   ins_pipe( pipe_slow );
5156 %}
5157 
5158 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
5159   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
5160   match(Set dst (ReplicateD zero));
5161   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
5162   ins_encode %{
5163     // Use vpxor in place of vxorpd since EVEX-encoded vxorpd requires AVX512DQ: this is a 512-bit operation
5164     int vector_len = 2;
5165     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5166   %}
5167   ins_pipe( fpu_reg_reg );
5168 %}
5169 
5170 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
5171   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
5172   match(Set dst (ReplicateD zero));
5173   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
5174   ins_encode %{
5175     // Use vpxor in place of vxorpd since EVEX-encoded vxorpd requires AVX512DQ: this is a 512-bit operation
5176     int vector_len = 2;
5177     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5178   %}
5179   ins_pipe( fpu_reg_reg );
5180 %}
5181 
5182 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
5183   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5184   match(Set dst (ReplicateD zero));
5185   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8D zero" %}
5186   ins_encode %{
5187     // Use vpxor in place of vxorpd since EVEX-encoded vxorpd requires AVX512DQ: this is a 512-bit operation
5188     int vector_len = 2;
5189     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5190   %}
5191   ins_pipe( fpu_reg_reg );
5192 %}
5193 
5194 // ====================REDUCTION ARITHMETIC=======================================
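// The reduction instructs below fold a whole vector operand (src2) together
// with a scalar operand (src1) into a single scalar result.  Conceptually
// (an illustrative sketch of the ideal-node semantics, not matcher code):
//
//   int acc = src1;
//   for (int i = 0; i < n; i++) acc += src2[i];   // AddReductionVI
//
// The integer expansions avoid the loop by repeatedly halving the vector
// (vextract*_high / pshufd) and combining the halves, folding in src1 with
// one final vector op, and moving the low element to a GPR with movd/movdq.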
5195 
5196 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5197   predicate(UseSSE > 2 && UseAVX == 0);
5198   match(Set dst (AddReductionVI src1 src2));
5199   effect(TEMP tmp2, TEMP tmp);
5200   format %{ "movdqu  $tmp2,$src2\n\t"
5201             "phaddd  $tmp2,$tmp2\n\t"
5202             "movd    $tmp,$src1\n\t"
5203             "paddd   $tmp,$tmp2\n\t"
5204             "movd    $dst,$tmp\t! add reduction2I" %}
5205   ins_encode %{
5206     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
5207     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5208     __ movdl($tmp$$XMMRegister, $src1$$Register);
5209     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
5210     __ movdl($dst$$Register, $tmp$$XMMRegister);
5211   %}
5212   ins_pipe( pipe_slow );
5213 %}
5214 
5215 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5216   predicate(VM_Version::supports_avxonly());
5217   match(Set dst (AddReductionVI src1 src2));
5218   effect(TEMP tmp, TEMP tmp2);
5219   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5220             "movd     $tmp2,$src1\n\t"
5221             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5222             "movd     $dst,$tmp2\t! add reduction2I" %}
5223   ins_encode %{
5224     int vector_len = 0;
5225     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5226     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5227     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5228     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5229   %}
5230   ins_pipe( pipe_slow );
5231 %}
5232 
5233 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5234   predicate(UseAVX > 2);
5235   match(Set dst (AddReductionVI src1 src2));
5236   effect(TEMP tmp, TEMP tmp2);
5237   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5238             "vpaddd  $tmp,$src2,$tmp2\n\t"
5239             "movd    $tmp2,$src1\n\t"
5240             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5241             "movd    $dst,$tmp2\t! add reduction2I" %}
5242   ins_encode %{
5243     int vector_len = 0;
5244     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5245     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5246     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5247     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5248     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5249   %}
5250   ins_pipe( pipe_slow );
5251 %}
5252 
5253 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5254   predicate(UseSSE > 2 && UseAVX == 0);
5255   match(Set dst (AddReductionVI src1 src2));
5256   effect(TEMP tmp, TEMP tmp2);
5257   format %{ "movdqu  $tmp,$src2\n\t"
5258             "phaddd  $tmp,$tmp\n\t"
5259             "phaddd  $tmp,$tmp\n\t"
5260             "movd    $tmp2,$src1\n\t"
5261             "paddd   $tmp2,$tmp\n\t"
5262             "movd    $dst,$tmp2\t! add reduction4I" %}
5263   ins_encode %{
5264     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
5265     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5266     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5267     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5268     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
5269     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5270   %}
5271   ins_pipe( pipe_slow );
5272 %}
5273 
5274 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5275   predicate(VM_Version::supports_avxonly());
5276   match(Set dst (AddReductionVI src1 src2));
5277   effect(TEMP tmp, TEMP tmp2);
5278   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5279             "vphaddd  $tmp,$tmp,$tmp\n\t"
5280             "movd     $tmp2,$src1\n\t"
5281             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5282             "movd     $dst,$tmp2\t! add reduction4I" %}
5283   ins_encode %{
5284     int vector_len = 0;
5285     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5286     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
5287     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5288     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
5289     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5290   %}
5291   ins_pipe( pipe_slow );
5292 %}
5293 
5294 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5295   predicate(UseAVX > 2);
5296   match(Set dst (AddReductionVI src1 src2));
5297   effect(TEMP tmp, TEMP tmp2);
5298   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5299             "vpaddd  $tmp,$src2,$tmp2\n\t"
5300             "pshufd  $tmp2,$tmp,0x1\n\t"
5301             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5302             "movd    $tmp2,$src1\n\t"
5303             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5304             "movd    $dst,$tmp2\t! add reduction4I" %}
5305   ins_encode %{
5306     int vector_len = 0;
5307     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5308     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5309     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5310     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5311     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5312     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5313     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5314   %}
5315   ins_pipe( pipe_slow );
5316 %}
5317 
5318 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5319   predicate(VM_Version::supports_avxonly());
5320   match(Set dst (AddReductionVI src1 src2));
5321   effect(TEMP tmp, TEMP tmp2);
5322   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5323             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5324             "vextracti128_high  $tmp2,$tmp\n\t"
5325             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5326             "movd     $tmp2,$src1\n\t"
5327             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5328             "movd     $dst,$tmp2\t! add reduction8I" %}
5329   ins_encode %{
5330     int vector_len = 1;
5331     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5332     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5333     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5334     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5335     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5336     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5337     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5338   %}
5339   ins_pipe( pipe_slow );
5340 %}
5341 
5342 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5343   predicate(UseAVX > 2);
5344   match(Set dst (AddReductionVI src1 src2));
5345   effect(TEMP tmp, TEMP tmp2);
5346   format %{ "vextracti128_high  $tmp,$src2\n\t"
5347             "vpaddd  $tmp,$tmp,$src2\n\t"
5348             "pshufd  $tmp2,$tmp,0xE\n\t"
5349             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5350             "pshufd  $tmp2,$tmp,0x1\n\t"
5351             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5352             "movd    $tmp2,$src1\n\t"
5353             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5354             "movd    $dst,$tmp2\t! add reduction8I" %}
5355   ins_encode %{
5356     int vector_len = 0;
5357     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5358     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5359     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5360     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5361     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5362     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5363     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5364     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5365     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5366   %}
5367   ins_pipe( pipe_slow );
5368 %}
5369 
5370 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5371   predicate(UseAVX > 2);
5372   match(Set dst (AddReductionVI src1 src2));
5373   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5374   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5375             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5376             "vextracti128_high  $tmp,$tmp3\n\t"
5377             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5378             "pshufd  $tmp2,$tmp,0xE\n\t"
5379             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5380             "pshufd  $tmp2,$tmp,0x1\n\t"
5381             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5382             "movd    $tmp2,$src1\n\t"
5383             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5384             "movd    $dst,$tmp2\t! add reduction16I" %}
5385   ins_encode %{
5386     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5387     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5388     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5389     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5390     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5391     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5392     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5393     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5394     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5395     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5396     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5397   %}
5398   ins_pipe( pipe_slow );
5399 %}
5400 
5401 #ifdef _LP64
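// The long (AddReductionVL) instructs are LP64-only: they move the scalar
// operand and the result between a 64-bit general register and an XMM
// register with movdq, which needs a 64-bit GPR.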
5402 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5403   predicate(UseAVX > 2);
5404   match(Set dst (AddReductionVL src1 src2));
5405   effect(TEMP tmp, TEMP tmp2);
5406   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5407             "vpaddq  $tmp,$src2,$tmp2\n\t"
5408             "movdq   $tmp2,$src1\n\t"
5409             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5410             "movdq   $dst,$tmp2\t! add reduction2L" %}
5411   ins_encode %{
5412     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5413     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5414     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5415     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5416     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5417   %}
5418   ins_pipe( pipe_slow );
5419 %}
5420 
5421 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5422   predicate(UseAVX > 2);
5423   match(Set dst (AddReductionVL src1 src2));
5424   effect(TEMP tmp, TEMP tmp2);
5425   format %{ "vextracti128_high  $tmp,$src2\n\t"
5426             "vpaddq  $tmp2,$tmp,$src2\n\t"
5427             "pshufd  $tmp,$tmp2,0xE\n\t"
5428             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5429             "movdq   $tmp,$src1\n\t"
5430             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5431             "movdq   $dst,$tmp2\t! add reduction4L" %}
5432   ins_encode %{
5433     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5434     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5435     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5436     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5437     __ movdq($tmp$$XMMRegister, $src1$$Register);
5438     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5439     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5440   %}
5441   ins_pipe( pipe_slow );
5442 %}
5443 
5444 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5445   predicate(UseAVX > 2);
5446   match(Set dst (AddReductionVL src1 src2));
5447   effect(TEMP tmp, TEMP tmp2);
5448   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5449             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5450             "vextracti128_high  $tmp,$tmp2\n\t"
5451             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5452             "pshufd  $tmp,$tmp2,0xE\n\t"
5453             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5454             "movdq   $tmp,$src1\n\t"
5455             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5456             "movdq   $dst,$tmp2\t! add reduction8L" %}
5457   ins_encode %{
5458     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5459     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5460     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5461     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5462     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5463     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5464     __ movdq($tmp$$XMMRegister, $src1$$Register);
5465     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5466     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5467   %}
5468   ins_pipe( pipe_slow );
5469 %}
5470 #endif
5471 
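// The float and double reductions below are expanded as a serial chain of
// scalar addss/addsd operations in lane order rather than a pairwise fold.
// Floating-point addition is not associative, so the lanes are combined
// strictly in order.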
5472 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5473   predicate(UseSSE >= 1 && UseAVX == 0);
5474   match(Set dst (AddReductionVF dst src2));
5475   effect(TEMP dst, TEMP tmp);
5476   format %{ "addss   $dst,$src2\n\t"
5477             "pshufd  $tmp,$src2,0x01\n\t"
5478             "addss   $dst,$tmp\t! add reduction2F" %}
5479   ins_encode %{
5480     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5481     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5482     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5483   %}
5484   ins_pipe( pipe_slow );
5485 %}
5486 
5487 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5488   predicate(UseAVX > 0);
5489   match(Set dst (AddReductionVF dst src2));
5490   effect(TEMP dst, TEMP tmp);
5491   format %{ "vaddss  $dst,$dst,$src2\n\t"
5492             "pshufd  $tmp,$src2,0x01\n\t"
5493             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5494   ins_encode %{
5495     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5496     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5497     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5498   %}
5499   ins_pipe( pipe_slow );
5500 %}
5501 
5502 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5503   predicate(UseSSE >= 1 && UseAVX == 0);
5504   match(Set dst (AddReductionVF dst src2));
5505   effect(TEMP dst, TEMP tmp);
5506   format %{ "addss   $dst,$src2\n\t"
5507             "pshufd  $tmp,$src2,0x01\n\t"
5508             "addss   $dst,$tmp\n\t"
5509             "pshufd  $tmp,$src2,0x02\n\t"
5510             "addss   $dst,$tmp\n\t"
5511             "pshufd  $tmp,$src2,0x03\n\t"
5512             "addss   $dst,$tmp\t! add reduction4F" %}
5513   ins_encode %{
5514     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5515     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5516     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5517     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5518     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5519     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5520     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5521   %}
5522   ins_pipe( pipe_slow );
5523 %}
5524 
5525 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5526   predicate(UseAVX > 0);
5527   match(Set dst (AddReductionVF dst src2));
5528   effect(TEMP tmp, TEMP dst);
5529   format %{ "vaddss  $dst,$dst,$src2\n\t"
5530             "pshufd  $tmp,$src2,0x01\n\t"
5531             "vaddss  $dst,$dst,$tmp\n\t"
5532             "pshufd  $tmp,$src2,0x02\n\t"
5533             "vaddss  $dst,$dst,$tmp\n\t"
5534             "pshufd  $tmp,$src2,0x03\n\t"
5535             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5536   ins_encode %{
5537     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5538     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5539     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5540     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5541     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5542     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5543     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5544   %}
5545   ins_pipe( pipe_slow );
5546 %}
5547 
5548 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5549   predicate(UseAVX > 0);
5550   match(Set dst (AddReductionVF dst src2));
5551   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5552   format %{ "vaddss  $dst,$dst,$src2\n\t"
5553             "pshufd  $tmp,$src2,0x01\n\t"
5554             "vaddss  $dst,$dst,$tmp\n\t"
5555             "pshufd  $tmp,$src2,0x02\n\t"
5556             "vaddss  $dst,$dst,$tmp\n\t"
5557             "pshufd  $tmp,$src2,0x03\n\t"
5558             "vaddss  $dst,$dst,$tmp\n\t"
5559             "vextractf128_high  $tmp2,$src2\n\t"
5560             "vaddss  $dst,$dst,$tmp2\n\t"
5561             "pshufd  $tmp,$tmp2,0x01\n\t"
5562             "vaddss  $dst,$dst,$tmp\n\t"
5563             "pshufd  $tmp,$tmp2,0x02\n\t"
5564             "vaddss  $dst,$dst,$tmp\n\t"
5565             "pshufd  $tmp,$tmp2,0x03\n\t"
5566             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5567   ins_encode %{
5568     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5569     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5570     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5571     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5572     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5573     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5574     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5575     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5576     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5577     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5578     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5579     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5580     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5581     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5582     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5583   %}
5584   ins_pipe( pipe_slow );
5585 %}
5586 
5587 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5588   predicate(UseAVX > 2);
5589   match(Set dst (AddReductionVF dst src2));
5590   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5591   format %{ "vaddss  $dst,$dst,$src2\n\t"
5592             "pshufd  $tmp,$src2,0x01\n\t"
5593             "vaddss  $dst,$dst,$tmp\n\t"
5594             "pshufd  $tmp,$src2,0x02\n\t"
5595             "vaddss  $dst,$dst,$tmp\n\t"
5596             "pshufd  $tmp,$src2,0x03\n\t"
5597             "vaddss  $dst,$dst,$tmp\n\t"
5598             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5599             "vaddss  $dst,$dst,$tmp2\n\t"
5600             "pshufd  $tmp,$tmp2,0x01\n\t"
5601             "vaddss  $dst,$dst,$tmp\n\t"
5602             "pshufd  $tmp,$tmp2,0x02\n\t"
5603             "vaddss  $dst,$dst,$tmp\n\t"
5604             "pshufd  $tmp,$tmp2,0x03\n\t"
5605             "vaddss  $dst,$dst,$tmp\n\t"
5606             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5607             "vaddss  $dst,$dst,$tmp2\n\t"
5608             "pshufd  $tmp,$tmp2,0x01\n\t"
5609             "vaddss  $dst,$dst,$tmp\n\t"
5610             "pshufd  $tmp,$tmp2,0x02\n\t"
5611             "vaddss  $dst,$dst,$tmp\n\t"
5612             "pshufd  $tmp,$tmp2,0x03\n\t"
5613             "vaddss  $dst,$dst,$tmp\n\t"
5614             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5615             "vaddss  $dst,$dst,$tmp2\n\t"
5616             "pshufd  $tmp,$tmp2,0x01\n\t"
5617             "vaddss  $dst,$dst,$tmp\n\t"
5618             "pshufd  $tmp,$tmp2,0x02\n\t"
5619             "vaddss  $dst,$dst,$tmp\n\t"
5620             "pshufd  $tmp,$tmp2,0x03\n\t"
5621             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5622   ins_encode %{
5623     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5624     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5625     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5626     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5627     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5628     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5629     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5630     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5631     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5632     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5633     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5634     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5635     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5636     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5637     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5638     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5639     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5640     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5641     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5642     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5643     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5644     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5645     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5646     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5647     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5648     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5649     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5650     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5651     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5652     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5653     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5654   %}
5655   ins_pipe( pipe_slow );
5656 %}
5657 
5658 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5659   predicate(UseSSE >= 1 && UseAVX == 0);
5660   match(Set dst (AddReductionVD dst src2));
5661   effect(TEMP tmp, TEMP dst);
5662   format %{ "addsd   $dst,$src2\n\t"
5663             "pshufd  $tmp,$src2,0xE\n\t"
5664             "addsd   $dst,$tmp\t! add reduction2D" %}
5665   ins_encode %{
5666     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5667     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5668     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5669   %}
5670   ins_pipe( pipe_slow );
5671 %}
5672 
5673 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5674   predicate(UseAVX > 0);
5675   match(Set dst (AddReductionVD dst src2));
5676   effect(TEMP tmp, TEMP dst);
5677   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5678             "pshufd  $tmp,$src2,0xE\n\t"
5679             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5680   ins_encode %{
5681     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5682     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5683     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5684   %}
5685   ins_pipe( pipe_slow );
5686 %}
5687 
5688 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5689   predicate(UseAVX > 0);
5690   match(Set dst (AddReductionVD dst src2));
5691   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5692   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5693             "pshufd  $tmp,$src2,0xE\n\t"
5694             "vaddsd  $dst,$dst,$tmp\n\t"
5695             "vextractf128  $tmp2,$src2,0x1\n\t"
5696             "vaddsd  $dst,$dst,$tmp2\n\t"
5697             "pshufd  $tmp,$tmp2,0xE\n\t"
5698             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5699   ins_encode %{
5700     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5701     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5702     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5703     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5704     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5705     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5706     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5707   %}
5708   ins_pipe( pipe_slow );
5709 %}
5710 
5711 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5712   predicate(UseAVX > 2);
5713   match(Set dst (AddReductionVD dst src2));
5714   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5715   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5716             "pshufd  $tmp,$src2,0xE\n\t"
5717             "vaddsd  $dst,$dst,$tmp\n\t"
5718             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5719             "vaddsd  $dst,$dst,$tmp2\n\t"
5720             "pshufd  $tmp,$tmp2,0xE\n\t"
5721             "vaddsd  $dst,$dst,$tmp\n\t"
5722             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5723             "vaddsd  $dst,$dst,$tmp2\n\t"
5724             "pshufd  $tmp,$tmp2,0xE\n\t"
5725             "vaddsd  $dst,$dst,$tmp\n\t"
5726             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5727             "vaddsd  $dst,$dst,$tmp2\n\t"
5728             "pshufd  $tmp,$tmp2,0xE\n\t"
5729             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5730   ins_encode %{
5731     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5732     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5733     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5734     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5735     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5736     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5737     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5738     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5739     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5740     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5741     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5742     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5743     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5744     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5745     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5746   %}
5747   ins_pipe( pipe_slow );
5748 %}
5749 
5750 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5751   predicate(UseSSE > 3 && UseAVX == 0);
5752   match(Set dst (MulReductionVI src1 src2));
5753   effect(TEMP tmp, TEMP tmp2);
5754   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5755             "pmulld  $tmp2,$src2\n\t"
5756             "movd    $tmp,$src1\n\t"
5757             "pmulld  $tmp2,$tmp\n\t"
5758             "movd    $dst,$tmp2\t! mul reduction2I" %}
5759   ins_encode %{
5760     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5761     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5762     __ movdl($tmp$$XMMRegister, $src1$$Register);
5763     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5764     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5765   %}
5766   ins_pipe( pipe_slow );
5767 %}
5768 
5769 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5770   predicate(UseAVX > 0);
5771   match(Set dst (MulReductionVI src1 src2));
5772   effect(TEMP tmp, TEMP tmp2);
5773   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5774             "vpmulld  $tmp,$src2,$tmp2\n\t"
5775             "movd     $tmp2,$src1\n\t"
5776             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5777             "movd     $dst,$tmp2\t! mul reduction2I" %}
5778   ins_encode %{
5779     int vector_len = 0;
5780     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5781     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5782     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5783     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5784     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5785   %}
5786   ins_pipe( pipe_slow );
5787 %}
5788 
5789 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5790   predicate(UseSSE > 3 && UseAVX == 0);
5791   match(Set dst (MulReductionVI src1 src2));
5792   effect(TEMP tmp, TEMP tmp2);
5793   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5794             "pmulld  $tmp2,$src2\n\t"
5795             "pshufd  $tmp,$tmp2,0x1\n\t"
5796             "pmulld  $tmp2,$tmp\n\t"
5797             "movd    $tmp,$src1\n\t"
5798             "pmulld  $tmp2,$tmp\n\t"
5799             "movd    $dst,$tmp2\t! mul reduction4I" %}
5800   ins_encode %{
5801     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5802     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5803     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5804     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5805     __ movdl($tmp$$XMMRegister, $src1$$Register);
5806     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5807     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5808   %}
5809   ins_pipe( pipe_slow );
5810 %}
5811 
5812 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5813   predicate(UseAVX > 0);
5814   match(Set dst (MulReductionVI src1 src2));
5815   effect(TEMP tmp, TEMP tmp2);
5816   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5817             "vpmulld  $tmp,$src2,$tmp2\n\t"
5818             "pshufd   $tmp2,$tmp,0x1\n\t"
5819             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5820             "movd     $tmp2,$src1\n\t"
5821             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5822             "movd     $dst,$tmp2\t! mul reduction4I" %}
5823   ins_encode %{
5824     int vector_len = 0;
5825     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5826     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5827     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5828     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5829     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5830     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5831     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5837   predicate(UseAVX > 1);
5838   match(Set dst (MulReductionVI src1 src2));
5839   effect(TEMP tmp, TEMP tmp2);
5840   format %{ "vextracti128_high  $tmp,$src2\n\t"
5841             "vpmulld  $tmp,$tmp,$src2\n\t"
5842             "pshufd   $tmp2,$tmp,0xE\n\t"
5843             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5844             "pshufd   $tmp2,$tmp,0x1\n\t"
5845             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5846             "movd     $tmp2,$src1\n\t"
5847             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5848             "movd     $dst,$tmp2\t! mul reduction8I" %}
5849   ins_encode %{
5850     int vector_len = 0;
5851     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5852     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5853     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5854     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5855     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5856     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5857     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5858     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5859     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5860   %}
5861   ins_pipe( pipe_slow );
5862 %}
5863 
5864 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5865   predicate(UseAVX > 2);
5866   match(Set dst (MulReductionVI src1 src2));
5867   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5868   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5869             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5870             "vextracti128_high  $tmp,$tmp3\n\t"
5871             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5872             "pshufd   $tmp2,$tmp,0xE\n\t"
5873             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5874             "pshufd   $tmp2,$tmp,0x1\n\t"
5875             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5876             "movd     $tmp2,$src1\n\t"
5877             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5878             "movd     $dst,$tmp2\t! mul reduction16I" %}
5879   ins_encode %{
5880     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5881     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5882     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5883     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5884     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5885     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5886     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5887     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5888     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5889     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5890     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5891   %}
5892   ins_pipe( pipe_slow );
5893 %}
5894 
5895 #ifdef _LP64
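// As with the add reductions, the long multiply reductions are LP64-only
// because of the 64-bit movdq transfers.  Their predicates additionally
// require AVX512DQ, since vpmullq is an AVX512DQ instruction.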
5896 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5897   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5898   match(Set dst (MulReductionVL src1 src2));
5899   effect(TEMP tmp, TEMP tmp2);
5900   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5901             "vpmullq  $tmp,$src2,$tmp2\n\t"
5902             "movdq    $tmp2,$src1\n\t"
5903             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5904             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5905   ins_encode %{
5906     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5907     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5908     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5909     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5910     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5911   %}
5912   ins_pipe( pipe_slow );
5913 %}
5914 
5915 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5916   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5917   match(Set dst (MulReductionVL src1 src2));
5918   effect(TEMP tmp, TEMP tmp2);
5919   format %{ "vextracti128_high  $tmp,$src2\n\t"
5920             "vpmullq  $tmp2,$tmp,$src2\n\t"
5921             "pshufd   $tmp,$tmp2,0xE\n\t"
5922             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5923             "movdq    $tmp,$src1\n\t"
5924             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5925             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5926   ins_encode %{
5927     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5928     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5929     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5930     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5931     __ movdq($tmp$$XMMRegister, $src1$$Register);
5932     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5933     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5934   %}
5935   ins_pipe( pipe_slow );
5936 %}
5937 
5938 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5939   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5940   match(Set dst (MulReductionVL src1 src2));
5941   effect(TEMP tmp, TEMP tmp2);
5942   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5943             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5944             "vextracti128_high  $tmp,$tmp2\n\t"
5945             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5946             "pshufd   $tmp,$tmp2,0xE\n\t"
5947             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5948             "movdq    $tmp,$src1\n\t"
5949             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5950             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5951   ins_encode %{
5952     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5953     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5954     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5955     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5956     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5957     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5958     __ movdq($tmp$$XMMRegister, $src1$$Register);
5959     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5960     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5961   %}
5962   ins_pipe( pipe_slow );
5963 %}
5964 #endif
5965 
5966 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5967   predicate(UseSSE >= 1 && UseAVX == 0);
5968   match(Set dst (MulReductionVF dst src2));
5969   effect(TEMP dst, TEMP tmp);
5970   format %{ "mulss   $dst,$src2\n\t"
5971             "pshufd  $tmp,$src2,0x01\n\t"
5972             "mulss   $dst,$tmp\t! mul reduction2F" %}
5973   ins_encode %{
5974     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5975     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5976     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5977   %}
5978   ins_pipe( pipe_slow );
5979 %}
5980 
5981 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5982   predicate(UseAVX > 0);
5983   match(Set dst (MulReductionVF dst src2));
5984   effect(TEMP tmp, TEMP dst);
5985   format %{ "vmulss  $dst,$dst,$src2\n\t"
5986             "pshufd  $tmp,$src2,0x01\n\t"
5987             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5988   ins_encode %{
5989     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5990     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5991     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5992   %}
5993   ins_pipe( pipe_slow );
5994 %}
5995 
5996 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5997   predicate(UseSSE >= 1 && UseAVX == 0);
5998   match(Set dst (MulReductionVF dst src2));
5999   effect(TEMP dst, TEMP tmp);
6000   format %{ "mulss   $dst,$src2\n\t"
6001             "pshufd  $tmp,$src2,0x01\n\t"
6002             "mulss   $dst,$tmp\n\t"
6003             "pshufd  $tmp,$src2,0x02\n\t"
6004             "mulss   $dst,$tmp\n\t"
6005             "pshufd  $tmp,$src2,0x03\n\t"
6006             "mulss   $dst,$tmp\t! mul reduction4F" %}
6007   ins_encode %{
6008     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
6009     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6010     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
6011     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6012     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
6013     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6014     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
6015   %}
6016   ins_pipe( pipe_slow );
6017 %}
6018 
6019 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
6020   predicate(UseAVX > 0);
6021   match(Set dst (MulReductionVF dst src2));
6022   effect(TEMP tmp, TEMP dst);
6023   format %{ "vmulss  $dst,$dst,$src2\n\t"
6024             "pshufd  $tmp,$src2,0x01\n\t"
6025             "vmulss  $dst,$dst,$tmp\n\t"
6026             "pshufd  $tmp,$src2,0x02\n\t"
6027             "vmulss  $dst,$dst,$tmp\n\t"
6028             "pshufd  $tmp,$src2,0x03\n\t"
6029             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
6030   ins_encode %{
6031     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6032     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6033     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6034     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6035     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6036     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6037     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6038   %}
6039   ins_pipe( pipe_slow );
6040 %}
6041 
6042 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
6043   predicate(UseAVX > 0);
6044   match(Set dst (MulReductionVF dst src2));
6045   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6046   format %{ "vmulss  $dst,$dst,$src2\n\t"
6047             "pshufd  $tmp,$src2,0x01\n\t"
6048             "vmulss  $dst,$dst,$tmp\n\t"
6049             "pshufd  $tmp,$src2,0x02\n\t"
6050             "vmulss  $dst,$dst,$tmp\n\t"
6051             "pshufd  $tmp,$src2,0x03\n\t"
6052             "vmulss  $dst,$dst,$tmp\n\t"
6053             "vextractf128_high  $tmp2,$src2\n\t"
6054             "vmulss  $dst,$dst,$tmp2\n\t"
6055             "pshufd  $tmp,$tmp2,0x01\n\t"
6056             "vmulss  $dst,$dst,$tmp\n\t"
6057             "pshufd  $tmp,$tmp2,0x02\n\t"
6058             "vmulss  $dst,$dst,$tmp\n\t"
6059             "pshufd  $tmp,$tmp2,0x03\n\t"
6060             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
6061   ins_encode %{
6062     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6063     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6064     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6065     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6066     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6067     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6068     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6069     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6070     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6071     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6072     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6073     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6074     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6075     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6076     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6077   %}
6078   ins_pipe( pipe_slow );
6079 %}
6080 
6081 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
6082   predicate(UseAVX > 2);
6083   match(Set dst (MulReductionVF dst src2));
6084   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6085   format %{ "vmulss  $dst,$dst,$src2\n\t"
6086             "pshufd  $tmp,$src2,0x01\n\t"
6087             "vmulss  $dst,$dst,$tmp\n\t"
6088             "pshufd  $tmp,$src2,0x02\n\t"
6089             "vmulss  $dst,$dst,$tmp\n\t"
6090             "pshufd  $tmp,$src2,0x03\n\t"
6091             "vmulss  $dst,$dst,$tmp\n\t"
6092             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6093             "vmulss  $dst,$dst,$tmp2\n\t"
6094             "pshufd  $tmp,$tmp2,0x01\n\t"
6095             "vmulss  $dst,$dst,$tmp\n\t"
6096             "pshufd  $tmp,$tmp2,0x02\n\t"
6097             "vmulss  $dst,$dst,$tmp\n\t"
6098             "pshufd  $tmp,$tmp2,0x03\n\t"
6099             "vmulss  $dst,$dst,$tmp\n\t"
6100             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6101             "vmulss  $dst,$dst,$tmp2\n\t"
6102             "pshufd  $tmp,$tmp2,0x01\n\t"
6103             "vmulss  $dst,$dst,$tmp\n\t"
6104             "pshufd  $tmp,$tmp2,0x02\n\t"
6105             "vmulss  $dst,$dst,$tmp\n\t"
6106             "pshufd  $tmp,$tmp2,0x03\n\t"
6107             "vmulss  $dst,$dst,$tmp\n\t"
6108             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6109             "vmulss  $dst,$dst,$tmp2\n\t"
6110             "pshufd  $tmp,$tmp2,0x01\n\t"
6111             "vmulss  $dst,$dst,$tmp\n\t"
6112             "pshufd  $tmp,$tmp2,0x02\n\t"
6113             "vmulss  $dst,$dst,$tmp\n\t"
6114             "pshufd  $tmp,$tmp2,0x03\n\t"
6115             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
6116   ins_encode %{
6117     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6118     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6119     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6120     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6121     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6122     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6123     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6124     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6125     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6126     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6127     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6128     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6129     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6130     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6131     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6132     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6133     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6134     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6135     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6136     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6137     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6138     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6139     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6140     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6141     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6142     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6143     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6144     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6145     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6146     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6147     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6148   %}
6149   ins_pipe( pipe_slow );
6150 %}
6151 
6152 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6153   predicate(UseSSE >= 1 && UseAVX == 0);
6154   match(Set dst (MulReductionVD dst src2));
6155   effect(TEMP dst, TEMP tmp);
6156   format %{ "mulsd   $dst,$src2\n\t"
6157             "pshufd  $tmp,$src2,0xE\n\t"
6158             "mulsd   $dst,$tmp\t! mul reduction2D" %}
6159   ins_encode %{
6160     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
6161     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6162     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
6163   %}
6164   ins_pipe( pipe_slow );
6165 %}
6166 
6167 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6168   predicate(UseAVX > 0);
6169   match(Set dst (MulReductionVD dst src2));
6170   effect(TEMP tmp, TEMP dst);
6171   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6172             "pshufd  $tmp,$src2,0xE\n\t"
6173             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
6174   ins_encode %{
6175     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6176     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6177     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6178   %}
6179   ins_pipe( pipe_slow );
6180 %}
6181 
6182 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
6183   predicate(UseAVX > 0);
6184   match(Set dst (MulReductionVD dst src2));
6185   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6186   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6187             "pshufd  $tmp,$src2,0xE\n\t"
6188             "vmulsd  $dst,$dst,$tmp\n\t"
6189             "vextractf128_high  $tmp2,$src2\n\t"
6190             "vmulsd  $dst,$dst,$tmp2\n\t"
6191             "pshufd  $tmp,$tmp2,0xE\n\t"
6192             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
6193   ins_encode %{
6194     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6195     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6196     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6197     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6198     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6199     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6200     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6201   %}
6202   ins_pipe( pipe_slow );
6203 %}
6204 
6205 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
6206   predicate(UseAVX > 2);
6207   match(Set dst (MulReductionVD dst src2));
6208   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6209   format %{ "vmulsd  $dst,$dst,$src2\n\t"
6210             "pshufd  $tmp,$src2,0xE\n\t"
6211             "vmulsd  $dst,$dst,$tmp\n\t"
6212             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6213             "vmulsd  $dst,$dst,$tmp2\n\t"
6214             "pshufd  $tmp,$tmp2,0xE\n\t"
6215             "vmulsd  $dst,$dst,$tmp\n\t"
6216             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6217             "vmulsd  $dst,$dst,$tmp2\n\t"
6218             "pshufd  $tmp,$tmp2,0xE\n\t"
6219             "vmulsd  $dst,$dst,$tmp\n\t"
6220             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6221             "vmulsd  $dst,$dst,$tmp2\n\t"
6222             "pshufd  $tmp,$tmp2,0xE\n\t"
6223             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
6224   ins_encode %{
6225     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6226     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6227     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6228     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6229     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6230     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6231     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6232     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6233     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6234     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6235     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6236     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6237     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6238     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6239     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6240   %}
6241   ins_pipe( pipe_slow );
6242 %}
6243 
6244 // ====================VECTOR ARITHMETIC=======================================
6245 
6246 // --------------------------------- ADD --------------------------------------
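// Each operand size is typically covered by three variants: a two-operand
// SSE form that adds into dst in place, a three-operand AVX register form,
// and an AVX register-memory form that folds the LoadVector into the add.
// The vector_len argument passed to the assembler selects the encoding
// width: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.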
6247 
6248 // Bytes vector add
6249 instruct vadd4B(vecS dst, vecS src) %{
6250   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6251   match(Set dst (AddVB dst src));
6252   format %{ "paddb   $dst,$src\t! add packed4B" %}
6253   ins_encode %{
6254     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6255   %}
6256   ins_pipe( pipe_slow );
6257 %}
6258 
6259 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
6260   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6261   match(Set dst (AddVB src1 src2));
6262   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
6263   ins_encode %{
6264     int vector_len = 0;
6265     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6266   %}
6267   ins_pipe( pipe_slow );
6268 %}
6269 
6270 
6271 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
6272   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6273   match(Set dst (AddVB src (LoadVector mem)));
6274   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
6275   ins_encode %{
6276     int vector_len = 0;
6277     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6278   %}
6279   ins_pipe( pipe_slow );
6280 %}
6281 
6282 instruct vadd8B(vecD dst, vecD src) %{
6283   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6284   match(Set dst (AddVB dst src));
6285   format %{ "paddb   $dst,$src\t! add packed8B" %}
6286   ins_encode %{
6287     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6288   %}
6289   ins_pipe( pipe_slow );
6290 %}
6291 
6292 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6293   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6294   match(Set dst (AddVB src1 src2));
6295   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6296   ins_encode %{
6297     int vector_len = 0;
6298     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6299   %}
6300   ins_pipe( pipe_slow );
6301 %}
6302 
6304 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6305   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6306   match(Set dst (AddVB src (LoadVector mem)));
6307   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6308   ins_encode %{
6309     int vector_len = 0;
6310     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6311   %}
6312   ins_pipe( pipe_slow );
6313 %}
6314 
6315 instruct vadd16B(vecX dst, vecX src) %{
6316   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6317   match(Set dst (AddVB dst src));
6318   format %{ "paddb   $dst,$src\t! add packed16B" %}
6319   ins_encode %{
6320     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6321   %}
6322   ins_pipe( pipe_slow );
6323 %}
6324 
6325 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6326   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6327   match(Set dst (AddVB src1 src2));
6328   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6329   ins_encode %{
6330     int vector_len = 0;
6331     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6332   %}
6333   ins_pipe( pipe_slow );
6334 %}
6335 
6336 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6337   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6338   match(Set dst (AddVB src (LoadVector mem)));
6339   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6340   ins_encode %{
6341     int vector_len = 0;
6342     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6343   %}
6344   ins_pipe( pipe_slow );
6345 %}
6346 
6347 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6348   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6349   match(Set dst (AddVB src1 src2));
6350   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6351   ins_encode %{
6352     int vector_len = 1;
6353     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6354   %}
6355   ins_pipe( pipe_slow );
6356 %}
6357 
6358 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6359   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6360   match(Set dst (AddVB src (LoadVector mem)));
6361   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6362   ins_encode %{
6363     int vector_len = 1;
6364     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6365   %}
6366   ins_pipe( pipe_slow );
6367 %}
6368 
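// 512-bit byte (and short) adds also require AVX-512BW: base AVX-512F only
// provides dword/qword and floating-point operations on ZMM registers, hence
// the supports_avx512bw() check below.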
6369 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6370   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6371   match(Set dst (AddVB src1 src2));
6372   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6373   ins_encode %{
6374     int vector_len = 2;
6375     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6376   %}
6377   ins_pipe( pipe_slow );
6378 %}
6379 
6380 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6381   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6382   match(Set dst (AddVB src (LoadVector mem)));
6383   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6384   ins_encode %{
6385     int vector_len = 2;
6386     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6387   %}
6388   ins_pipe( pipe_slow );
6389 %}
6390 
6391 // Shorts/Chars vector add
6392 instruct vadd2S(vecS dst, vecS src) %{
6393   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6394   match(Set dst (AddVS dst src));
6395   format %{ "paddw   $dst,$src\t! add packed2S" %}
6396   ins_encode %{
6397     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6398   %}
6399   ins_pipe( pipe_slow );
6400 %}
6401 
6402 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6403   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6404   match(Set dst (AddVS src1 src2));
6405   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6406   ins_encode %{
6407     int vector_len = 0;
6408     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6409   %}
6410   ins_pipe( pipe_slow );
6411 %}
6412 
6413 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6414   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6415   match(Set dst (AddVS src (LoadVector mem)));
6416   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6417   ins_encode %{
6418     int vector_len = 0;
6419     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6420   %}
6421   ins_pipe( pipe_slow );
6422 %}
6423 
6424 instruct vadd4S(vecD dst, vecD src) %{
6425   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6426   match(Set dst (AddVS dst src));
6427   format %{ "paddw   $dst,$src\t! add packed4S" %}
6428   ins_encode %{
6429     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6430   %}
6431   ins_pipe( pipe_slow );
6432 %}
6433 
6434 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6435   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6436   match(Set dst (AddVS src1 src2));
6437   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6438   ins_encode %{
6439     int vector_len = 0;
6440     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6441   %}
6442   ins_pipe( pipe_slow );
6443 %}
6444 
6445 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6446   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6447   match(Set dst (AddVS src (LoadVector mem)));
6448   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6449   ins_encode %{
6450     int vector_len = 0;
6451     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6452   %}
6453   ins_pipe( pipe_slow );
6454 %}
6455 
6456 instruct vadd8S(vecX dst, vecX src) %{
6457   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6458   match(Set dst (AddVS dst src));
6459   format %{ "paddw   $dst,$src\t! add packed8S" %}
6460   ins_encode %{
6461     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6462   %}
6463   ins_pipe( pipe_slow );
6464 %}
6465 
6466 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6467   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6468   match(Set dst (AddVS src1 src2));
6469   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6470   ins_encode %{
6471     int vector_len = 0;
6472     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6473   %}
6474   ins_pipe( pipe_slow );
6475 %}
6476 
6477 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6478   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6479   match(Set dst (AddVS src (LoadVector mem)));
6480   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6481   ins_encode %{
6482     int vector_len = 0;
6483     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6484   %}
6485   ins_pipe( pipe_slow );
6486 %}
6487 
6488 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6489   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6490   match(Set dst (AddVS src1 src2));
6491   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6492   ins_encode %{
6493     int vector_len = 1;
6494     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6495   %}
6496   ins_pipe( pipe_slow );
6497 %}
6498 
6499 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6500   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6501   match(Set dst (AddVS src (LoadVector mem)));
6502   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6503   ins_encode %{
6504     int vector_len = 1;
6505     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6506   %}
6507   ins_pipe( pipe_slow );
6508 %}
6509 
6510 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6511   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6512   match(Set dst (AddVS src1 src2));
6513   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6514   ins_encode %{
6515     int vector_len = 2;
6516     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6517   %}
6518   ins_pipe( pipe_slow );
6519 %}
6520 
6521 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6522   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6523   match(Set dst (AddVS src (LoadVector mem)));
6524   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6525   ins_encode %{
6526     int vector_len = 2;
6527     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6528   %}
6529   ins_pipe( pipe_slow );
6530 %}
6531 
6532 // Integers vector add
6533 instruct vadd2I(vecD dst, vecD src) %{
6534   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6535   match(Set dst (AddVI dst src));
6536   format %{ "paddd   $dst,$src\t! add packed2I" %}
6537   ins_encode %{
6538     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6539   %}
6540   ins_pipe( pipe_slow );
6541 %}
6542 
6543 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6544   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6545   match(Set dst (AddVI src1 src2));
6546   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6547   ins_encode %{
6548     int vector_len = 0;
6549     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6550   %}
6551   ins_pipe( pipe_slow );
6552 %}
6553 
6554 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6555   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6556   match(Set dst (AddVI src (LoadVector mem)));
6557   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6558   ins_encode %{
6559     int vector_len = 0;
6560     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6561   %}
6562   ins_pipe( pipe_slow );
6563 %}
6564 
6565 instruct vadd4I(vecX dst, vecX src) %{
6566   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6567   match(Set dst (AddVI dst src));
6568   format %{ "paddd   $dst,$src\t! add packed4I" %}
6569   ins_encode %{
6570     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6571   %}
6572   ins_pipe( pipe_slow );
6573 %}
6574 
6575 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6576   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6577   match(Set dst (AddVI src1 src2));
6578   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6579   ins_encode %{
6580     int vector_len = 0;
6581     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6582   %}
6583   ins_pipe( pipe_slow );
6584 %}
6585 
6586 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6587   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6588   match(Set dst (AddVI src (LoadVector mem)));
6589   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6590   ins_encode %{
6591     int vector_len = 0;
6592     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6593   %}
6594   ins_pipe( pipe_slow );
6595 %}
6596 
6597 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6598   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6599   match(Set dst (AddVI src1 src2));
6600   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6601   ins_encode %{
6602     int vector_len = 1;
6603     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6604   %}
6605   ins_pipe( pipe_slow );
6606 %}
6607 
6608 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6609   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6610   match(Set dst (AddVI src (LoadVector mem)));
6611   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6612   ins_encode %{
6613     int vector_len = 1;
6614     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6615   %}
6616   ins_pipe( pipe_slow );
6617 %}
6618 
6619 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6620   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6621   match(Set dst (AddVI src1 src2));
6622   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6623   ins_encode %{
6624     int vector_len = 2;
6625     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6626   %}
6627   ins_pipe( pipe_slow );
6628 %}
6629 
6630 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6631   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6632   match(Set dst (AddVI src (LoadVector mem)));
6633   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6634   ins_encode %{
6635     int vector_len = 2;
6636     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6637   %}
6638   ins_pipe( pipe_slow );
6639 %}
6640 
6641 // Longs vector add
6642 instruct vadd2L(vecX dst, vecX src) %{
6643   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6644   match(Set dst (AddVL dst src));
6645   format %{ "paddq   $dst,$src\t! add packed2L" %}
6646   ins_encode %{
6647     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6648   %}
6649   ins_pipe( pipe_slow );
6650 %}
6651 
6652 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6653   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6654   match(Set dst (AddVL src1 src2));
6655   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6656   ins_encode %{
6657     int vector_len = 0;
6658     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6659   %}
6660   ins_pipe( pipe_slow );
6661 %}
6662 
6663 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6664   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6665   match(Set dst (AddVL src (LoadVector mem)));
6666   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6667   ins_encode %{
6668     int vector_len = 0;
6669     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6670   %}
6671   ins_pipe( pipe_slow );
6672 %}
6673 
6674 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6675   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6676   match(Set dst (AddVL src1 src2));
6677   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6678   ins_encode %{
6679     int vector_len = 1;
6680     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6681   %}
6682   ins_pipe( pipe_slow );
6683 %}
6684 
6685 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6686   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6687   match(Set dst (AddVL src (LoadVector mem)));
6688   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6689   ins_encode %{
6690     int vector_len = 1;
6691     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6692   %}
6693   ins_pipe( pipe_slow );
6694 %}
6695 
6696 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6697   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6698   match(Set dst (AddVL src1 src2));
6699   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6700   ins_encode %{
6701     int vector_len = 2;
6702     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6703   %}
6704   ins_pipe( pipe_slow );
6705 %}
6706 
6707 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6708   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6709   match(Set dst (AddVL src (LoadVector mem)));
6710   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6711   ins_encode %{
6712     int vector_len = 2;
6713     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6714   %}
6715   ins_pipe( pipe_slow );
6716 %}
6717 
6718 // Floats vector add
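// Unlike the integer forms above, the 256-bit FP adds only need UseAVX > 0,
// since AVX1 already provides 256-bit floating-point arithmetic; 256-bit
// integer arithmetic requires AVX2 (UseAVX > 1).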
6719 instruct vadd2F(vecD dst, vecD src) %{
6720   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6721   match(Set dst (AddVF dst src));
6722   format %{ "addps   $dst,$src\t! add packed2F" %}
6723   ins_encode %{
6724     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6725   %}
6726   ins_pipe( pipe_slow );
6727 %}
6728 
6729 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6730   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6731   match(Set dst (AddVF src1 src2));
6732   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6733   ins_encode %{
6734     int vector_len = 0;
6735     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6736   %}
6737   ins_pipe( pipe_slow );
6738 %}
6739 
6740 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6741   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6742   match(Set dst (AddVF src (LoadVector mem)));
6743   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6744   ins_encode %{
6745     int vector_len = 0;
6746     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6747   %}
6748   ins_pipe( pipe_slow );
6749 %}
6750 
6751 instruct vadd4F(vecX dst, vecX src) %{
6752   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6753   match(Set dst (AddVF dst src));
6754   format %{ "addps   $dst,$src\t! add packed4F" %}
6755   ins_encode %{
6756     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6757   %}
6758   ins_pipe( pipe_slow );
6759 %}
6760 
6761 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6762   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6763   match(Set dst (AddVF src1 src2));
6764   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6765   ins_encode %{
6766     int vector_len = 0;
6767     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6768   %}
6769   ins_pipe( pipe_slow );
6770 %}
6771 
6772 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6773   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6774   match(Set dst (AddVF src (LoadVector mem)));
6775   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6776   ins_encode %{
6777     int vector_len = 0;
6778     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6779   %}
6780   ins_pipe( pipe_slow );
6781 %}
6782 
6783 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6784   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6785   match(Set dst (AddVF src1 src2));
6786   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6787   ins_encode %{
6788     int vector_len = 1;
6789     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6790   %}
6791   ins_pipe( pipe_slow );
6792 %}
6793 
6794 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6795   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6796   match(Set dst (AddVF src (LoadVector mem)));
6797   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6798   ins_encode %{
6799     int vector_len = 1;
6800     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6801   %}
6802   ins_pipe( pipe_slow );
6803 %}
6804 
6805 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6806   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6807   match(Set dst (AddVF src1 src2));
6808   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6809   ins_encode %{
6810     int vector_len = 2;
6811     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6812   %}
6813   ins_pipe( pipe_slow );
6814 %}
6815 
6816 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6817   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6818   match(Set dst (AddVF src (LoadVector mem)));
6819   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6820   ins_encode %{
6821     int vector_len = 2;
6822     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6823   %}
6824   ins_pipe( pipe_slow );
6825 %}
6826 
6827 // Doubles vector add
6828 instruct vadd2D(vecX dst, vecX src) %{
6829   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6830   match(Set dst (AddVD dst src));
6831   format %{ "addpd   $dst,$src\t! add packed2D" %}
6832   ins_encode %{
6833     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6834   %}
6835   ins_pipe( pipe_slow );
6836 %}
6837 
6838 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6839   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6840   match(Set dst (AddVD src1 src2));
6841   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6842   ins_encode %{
6843     int vector_len = 0;
6844     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6845   %}
6846   ins_pipe( pipe_slow );
6847 %}
6848 
6849 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6850   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6851   match(Set dst (AddVD src (LoadVector mem)));
6852   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6853   ins_encode %{
6854     int vector_len = 0;
6855     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6856   %}
6857   ins_pipe( pipe_slow );
6858 %}
6859 
6860 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6861   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6862   match(Set dst (AddVD src1 src2));
6863   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6864   ins_encode %{
6865     int vector_len = 1;
6866     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6867   %}
6868   ins_pipe( pipe_slow );
6869 %}
6870 
6871 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6872   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6873   match(Set dst (AddVD src (LoadVector mem)));
6874   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6875   ins_encode %{
6876     int vector_len = 1;
6877     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6878   %}
6879   ins_pipe( pipe_slow );
6880 %}
6881 
6882 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6883   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6884   match(Set dst (AddVD src1 src2));
6885   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6886   ins_encode %{
6887     int vector_len = 2;
6888     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6889   %}
6890   ins_pipe( pipe_slow );
6891 %}
6892 
6893 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6894   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6895   match(Set dst (AddVD src (LoadVector mem)));
6896   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6897   ins_encode %{
6898     int vector_len = 2;
6899     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6900   %}
6901   ins_pipe( pipe_slow );
6902 %}
6903 
6904 // --------------------------------- SUB --------------------------------------
6905 
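// The subtraction patterns mirror the additions above, using psubb/psubw/
// psubd/psubq and subps/subpd (plus their VEX three-operand forms) with the
// same predicates and vector_len selection.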
6906 // Bytes vector sub
6907 instruct vsub4B(vecS dst, vecS src) %{
6908   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6909   match(Set dst (SubVB dst src));
6910   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6911   ins_encode %{
6912     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6913   %}
6914   ins_pipe( pipe_slow );
6915 %}
6916 
6917 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6918   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6919   match(Set dst (SubVB src1 src2));
6920   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6921   ins_encode %{
6922     int vector_len = 0;
6923     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6924   %}
6925   ins_pipe( pipe_slow );
6926 %}
6927 
6928 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6929   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6930   match(Set dst (SubVB src (LoadVector mem)));
6931   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6932   ins_encode %{
6933     int vector_len = 0;
6934     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6935   %}
6936   ins_pipe( pipe_slow );
6937 %}
6938 
6939 instruct vsub8B(vecD dst, vecD src) %{
6940   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6941   match(Set dst (SubVB dst src));
6942   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6943   ins_encode %{
6944     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6945   %}
6946   ins_pipe( pipe_slow );
6947 %}
6948 
6949 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6950   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6951   match(Set dst (SubVB src1 src2));
6952   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6953   ins_encode %{
6954     int vector_len = 0;
6955     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6956   %}
6957   ins_pipe( pipe_slow );
6958 %}
6959 
6960 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6961   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6962   match(Set dst (SubVB src (LoadVector mem)));
6963   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6964   ins_encode %{
6965     int vector_len = 0;
6966     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6967   %}
6968   ins_pipe( pipe_slow );
6969 %}
6970 
6971 instruct vsub16B(vecX dst, vecX src) %{
6972   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6973   match(Set dst (SubVB dst src));
6974   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6975   ins_encode %{
6976     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6977   %}
6978   ins_pipe( pipe_slow );
6979 %}
6980 
6981 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6982   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6983   match(Set dst (SubVB src1 src2));
6984   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6985   ins_encode %{
6986     int vector_len = 0;
6987     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6988   %}
6989   ins_pipe( pipe_slow );
6990 %}
6991 
6992 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6993   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6994   match(Set dst (SubVB src (LoadVector mem)));
6995   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6996   ins_encode %{
6997     int vector_len = 0;
6998     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6999   %}
7000   ins_pipe( pipe_slow );
7001 %}
7002 
7003 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
7004   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
7005   match(Set dst (SubVB src1 src2));
7006   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
7007   ins_encode %{
7008     int vector_len = 1;
7009     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7010   %}
7011   ins_pipe( pipe_slow );
7012 %}
7013 
7014 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
7015   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
7016   match(Set dst (SubVB src (LoadVector mem)));
7017   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
7018   ins_encode %{
7019     int vector_len = 1;
7020     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7021   %}
7022   ins_pipe( pipe_slow );
7023 %}
7024 
7025 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
7026   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7027   match(Set dst (SubVB src1 src2));
7028   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
7029   ins_encode %{
7030     int vector_len = 2;
7031     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7032   %}
7033   ins_pipe( pipe_slow );
7034 %}
7035 
7036 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
7037   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
7038   match(Set dst (SubVB src (LoadVector mem)));
7039   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
7040   ins_encode %{
7041     int vector_len = 2;
7042     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7043   %}
7044   ins_pipe( pipe_slow );
7045 %}
7046 
7047 // Shorts/Chars vector sub
7048 instruct vsub2S(vecS dst, vecS src) %{
7049   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7050   match(Set dst (SubVS dst src));
7051   format %{ "psubw   $dst,$src\t! sub packed2S" %}
7052   ins_encode %{
7053     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7054   %}
7055   ins_pipe( pipe_slow );
7056 %}
7057 
7058 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
7059   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7060   match(Set dst (SubVS src1 src2));
7061   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
7062   ins_encode %{
7063     int vector_len = 0;
7064     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7065   %}
7066   ins_pipe( pipe_slow );
7067 %}
7068 
7069 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
7070   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7071   match(Set dst (SubVS src (LoadVector mem)));
7072   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7073   ins_encode %{
7074     int vector_len = 0;
7075     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7076   %}
7077   ins_pipe( pipe_slow );
7078 %}
7079 
7080 instruct vsub4S(vecD dst, vecD src) %{
7081   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7082   match(Set dst (SubVS dst src));
7083   format %{ "psubw   $dst,$src\t! sub packed4S" %}
7084   ins_encode %{
7085     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7086   %}
7087   ins_pipe( pipe_slow );
7088 %}
7089 
7090 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
7091   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7092   match(Set dst (SubVS src1 src2));
7093   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7094   ins_encode %{
7095     int vector_len = 0;
7096     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7097   %}
7098   ins_pipe( pipe_slow );
7099 %}
7100 
7101 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
7102   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7103   match(Set dst (SubVS src (LoadVector mem)));
7104   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7105   ins_encode %{
7106     int vector_len = 0;
7107     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7108   %}
7109   ins_pipe( pipe_slow );
7110 %}
7111 
7112 instruct vsub8S(vecX dst, vecX src) %{
7113   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7114   match(Set dst (SubVS dst src));
7115   format %{ "psubw   $dst,$src\t! sub packed8S" %}
7116   ins_encode %{
7117     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7118   %}
7119   ins_pipe( pipe_slow );
7120 %}
7121 
7122 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
7123   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7124   match(Set dst (SubVS src1 src2));
7125   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7126   ins_encode %{
7127     int vector_len = 0;
7128     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7129   %}
7130   ins_pipe( pipe_slow );
7131 %}
7132 
7133 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
7134   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7135   match(Set dst (SubVS src (LoadVector mem)));
7136   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7137   ins_encode %{
7138     int vector_len = 0;
7139     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7140   %}
7141   ins_pipe( pipe_slow );
7142 %}
7143 
7144 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
7145   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7146   match(Set dst (SubVS src1 src2));
7147   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7148   ins_encode %{
7149     int vector_len = 1;
7150     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7151   %}
7152   ins_pipe( pipe_slow );
7153 %}
7154 
7155 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
7156   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7157   match(Set dst (SubVS src (LoadVector mem)));
7158   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7159   ins_encode %{
7160     int vector_len = 1;
7161     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7162   %}
7163   ins_pipe( pipe_slow );
7164 %}
7165 
7166 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7167   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7168   match(Set dst (SubVS src1 src2));
7169   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7170   ins_encode %{
7171     int vector_len = 2;
7172     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7173   %}
7174   ins_pipe( pipe_slow );
7175 %}
7176 
7177 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7178   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7179   match(Set dst (SubVS src (LoadVector mem)));
7180   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7181   ins_encode %{
7182     int vector_len = 2;
7183     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7184   %}
7185   ins_pipe( pipe_slow );
7186 %}
7187 
7188 // Integers vector sub
7189 instruct vsub2I(vecD dst, vecD src) %{
7190   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7191   match(Set dst (SubVI dst src));
7192   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7193   ins_encode %{
7194     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7195   %}
7196   ins_pipe( pipe_slow );
7197 %}
7198 
7199 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7200   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7201   match(Set dst (SubVI src1 src2));
7202   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7203   ins_encode %{
7204     int vector_len = 0;
7205     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7206   %}
7207   ins_pipe( pipe_slow );
7208 %}
7209 
7210 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7211   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7212   match(Set dst (SubVI src (LoadVector mem)));
7213   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7214   ins_encode %{
7215     int vector_len = 0;
7216     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7217   %}
7218   ins_pipe( pipe_slow );
7219 %}
7220 
7221 instruct vsub4I(vecX dst, vecX src) %{
7222   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7223   match(Set dst (SubVI dst src));
7224   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7225   ins_encode %{
7226     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7227   %}
7228   ins_pipe( pipe_slow );
7229 %}
7230 
7231 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7232   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7233   match(Set dst (SubVI src1 src2));
7234   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7235   ins_encode %{
7236     int vector_len = 0;
7237     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7238   %}
7239   ins_pipe( pipe_slow );
7240 %}
7241 
7242 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7243   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7244   match(Set dst (SubVI src (LoadVector mem)));
7245   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7246   ins_encode %{
7247     int vector_len = 0;
7248     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7249   %}
7250   ins_pipe( pipe_slow );
7251 %}
7252 
7253 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7254   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7255   match(Set dst (SubVI src1 src2));
7256   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7257   ins_encode %{
7258     int vector_len = 1;
7259     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7260   %}
7261   ins_pipe( pipe_slow );
7262 %}
7263 
7264 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7265   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7266   match(Set dst (SubVI src (LoadVector mem)));
7267   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7268   ins_encode %{
7269     int vector_len = 1;
7270     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7271   %}
7272   ins_pipe( pipe_slow );
7273 %}
7274 
7275 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7276   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7277   match(Set dst (SubVI src1 src2));
7278   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7279   ins_encode %{
7280     int vector_len = 2;
7281     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7282   %}
7283   ins_pipe( pipe_slow );
7284 %}
7285 
7286 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7287   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7288   match(Set dst (SubVI src (LoadVector mem)));
7289   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7290   ins_encode %{
7291     int vector_len = 2;
7292     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7293   %}
7294   ins_pipe( pipe_slow );
7295 %}
7296 
7297 // Longs vector sub
7298 instruct vsub2L(vecX dst, vecX src) %{
7299   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7300   match(Set dst (SubVL dst src));
7301   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7302   ins_encode %{
7303     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7304   %}
7305   ins_pipe( pipe_slow );
7306 %}
7307 
7308 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7309   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7310   match(Set dst (SubVL src1 src2));
7311   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7312   ins_encode %{
7313     int vector_len = 0;
7314     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7315   %}
7316   ins_pipe( pipe_slow );
7317 %}
7318 
7319 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7320   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7321   match(Set dst (SubVL src (LoadVector mem)));
7322   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7323   ins_encode %{
7324     int vector_len = 0;
7325     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7326   %}
7327   ins_pipe( pipe_slow );
7328 %}
7329 
7330 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7331   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7332   match(Set dst (SubVL src1 src2));
7333   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7334   ins_encode %{
7335     int vector_len = 1;
7336     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7337   %}
7338   ins_pipe( pipe_slow );
7339 %}
7340 
7341 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7342   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7343   match(Set dst (SubVL src (LoadVector mem)));
7344   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7345   ins_encode %{
7346     int vector_len = 1;
7347     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7348   %}
7349   ins_pipe( pipe_slow );
7350 %}
7351 
7352 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7353   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7354   match(Set dst (SubVL src1 src2));
7355   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7356   ins_encode %{
7357     int vector_len = 2;
7358     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7359   %}
7360   ins_pipe( pipe_slow );
7361 %}
7362 
7363 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7364   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7365   match(Set dst (SubVL src (LoadVector mem)));
7366   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7367   ins_encode %{
7368     int vector_len = 2;
7369     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7370   %}
7371   ins_pipe( pipe_slow );
7372 %}
7373 
7374 // Floats vector sub
7375 instruct vsub2F(vecD dst, vecD src) %{
7376   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7377   match(Set dst (SubVF dst src));
7378   format %{ "subps   $dst,$src\t! sub packed2F" %}
7379   ins_encode %{
7380     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7381   %}
7382   ins_pipe( pipe_slow );
7383 %}
7384 
7385 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7386   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7387   match(Set dst (SubVF src1 src2));
7388   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7389   ins_encode %{
7390     int vector_len = 0;
7391     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7392   %}
7393   ins_pipe( pipe_slow );
7394 %}
7395 
7396 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7397   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7398   match(Set dst (SubVF src (LoadVector mem)));
7399   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7400   ins_encode %{
7401     int vector_len = 0;
7402     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7403   %}
7404   ins_pipe( pipe_slow );
7405 %}
7406 
7407 instruct vsub4F(vecX dst, vecX src) %{
7408   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7409   match(Set dst (SubVF dst src));
7410   format %{ "subps   $dst,$src\t! sub packed4F" %}
7411   ins_encode %{
7412     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7413   %}
7414   ins_pipe( pipe_slow );
7415 %}
7416 
7417 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7418   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7419   match(Set dst (SubVF src1 src2));
7420   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7421   ins_encode %{
7422     int vector_len = 0;
7423     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7424   %}
7425   ins_pipe( pipe_slow );
7426 %}
7427 
7428 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7429   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7430   match(Set dst (SubVF src (LoadVector mem)));
7431   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7432   ins_encode %{
7433     int vector_len = 0;
7434     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7435   %}
7436   ins_pipe( pipe_slow );
7437 %}
7438 
7439 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7440   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7441   match(Set dst (SubVF src1 src2));
7442   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7443   ins_encode %{
7444     int vector_len = 1;
7445     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7446   %}
7447   ins_pipe( pipe_slow );
7448 %}
7449 
7450 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7451   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7452   match(Set dst (SubVF src (LoadVector mem)));
7453   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7454   ins_encode %{
7455     int vector_len = 1;
7456     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7457   %}
7458   ins_pipe( pipe_slow );
7459 %}
7460 
7461 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7462   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7463   match(Set dst (SubVF src1 src2));
7464   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7465   ins_encode %{
7466     int vector_len = 2;
7467     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7468   %}
7469   ins_pipe( pipe_slow );
7470 %}
7471 
7472 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7473   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7474   match(Set dst (SubVF src (LoadVector mem)));
7475   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7476   ins_encode %{
7477     int vector_len = 2;
7478     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7479   %}
7480   ins_pipe( pipe_slow );
7481 %}
7482 
7483 // Doubles vector sub
7484 instruct vsub2D(vecX dst, vecX src) %{
7485   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7486   match(Set dst (SubVD dst src));
7487   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7488   ins_encode %{
7489     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7490   %}
7491   ins_pipe( pipe_slow );
7492 %}
7493 
7494 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7495   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7496   match(Set dst (SubVD src1 src2));
7497   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7498   ins_encode %{
7499     int vector_len = 0;
7500     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7501   %}
7502   ins_pipe( pipe_slow );
7503 %}
7504 
7505 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7506   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7507   match(Set dst (SubVD src (LoadVector mem)));
7508   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7509   ins_encode %{
7510     int vector_len = 0;
7511     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7512   %}
7513   ins_pipe( pipe_slow );
7514 %}
7515 
7516 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7517   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7518   match(Set dst (SubVD src1 src2));
7519   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7520   ins_encode %{
7521     int vector_len = 1;
7522     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7523   %}
7524   ins_pipe( pipe_slow );
7525 %}
7526 
7527 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7528   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7529   match(Set dst (SubVD src (LoadVector mem)));
7530   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7531   ins_encode %{
7532     int vector_len = 1;
7533     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7534   %}
7535   ins_pipe( pipe_slow );
7536 %}
7537 
7538 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7539   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7540   match(Set dst (SubVD src1 src2));
7541   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7542   ins_encode %{
7543     int vector_len = 2;
7544     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7545   %}
7546   ins_pipe( pipe_slow );
7547 %}
7548 
7549 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7550   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7551   match(Set dst (SubVD src (LoadVector mem)));
7552   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7553   ins_encode %{
7554     int vector_len = 2;
7555     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7556   %}
7557   ins_pipe( pipe_slow );
7558 %}
7559 
7560 // --------------------------------- MUL --------------------------------------
7561 
7562 // Byte vector mul
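// x86 has no packed byte multiply, so the byte forms widen both operands to
// words with pmovsxbw, multiply with pmullw, mask each result word down to
// its low byte (vector_short_to_byte_mask, 0x00ff per word) and repack with
// packuswb.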
7563 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp, rRegI scratch) %{
7564   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7565   match(Set dst (MulVB src1 src2));
7566   effect(TEMP dst, TEMP tmp, TEMP scratch);
7567   format %{"pmovsxbw  $tmp,$src1\n\t"
7568            "pmovsxbw  $dst,$src2\n\t"
7569            "pmullw    $tmp,$dst\n\t"
7570            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7571            "pand      $dst,$tmp\n\t"
7572            "packuswb  $dst,$dst\t! mul packed4B" %}
7573   ins_encode %{
7574     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7575     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7576     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7577     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7578     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7579     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7580   %}
7581   ins_pipe( pipe_slow );
7582 %}
7583 
7584 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp, rRegI scratch) %{
7585   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
7586   match(Set dst (MulVB src1 src2));
7587   effect(TEMP dst, TEMP tmp, TEMP scratch);
7588   format %{"pmovsxbw  $tmp,$src1\n\t"
7589            "pmovsxbw  $dst,$src2\n\t"
7590            "pmullw    $tmp,$dst\n\t"
7591            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7592            "pand      $dst,$tmp\n\t"
7593            "packuswb  $dst,$dst\t! mul packed8B" %}
7594   ins_encode %{
7595     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7596     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7597     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7598     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7599     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7600     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7601   %}
7602   ins_pipe( pipe_slow );
7603 %}
7604 
7605 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp2, rRegI scratch) %{
7606   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
7607   match(Set dst (MulVB src1 src2));
7608   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7609   format %{"pmovsxbw  $tmp1,$src1\n\t"
7610            "pmovsxbw  $tmp2,$src2\n\t"
7611            "pmullw    $tmp1,$tmp2\n\t"
7612            "pshufd    $tmp2,$src1,0xEE\n\t"
7613            "pshufd    $dst,$src2,0xEE\n\t"
7614            "pmovsxbw  $tmp2,$tmp2\n\t"
7615            "pmovsxbw  $dst,$dst\n\t"
7616            "pmullw    $tmp2,$dst\n\t"
7617            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7618            "pand      $tmp2,$dst\n\t"
7619            "pand      $dst,$tmp1\n\t"
7620            "packuswb  $dst,$tmp2\t! mul packed16B" %}
7621   ins_encode %{
7622     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
7623     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
7624     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
7625     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
7626     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
7627     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
7628     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
7629     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
7630     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7631     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
7632     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
7633     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
7634   %}
7635   ins_pipe( pipe_slow );
7636 %}
7637 
7638 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp, rRegI scratch) %{
7639   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7640   match(Set dst (MulVB src1 src2));
7641   effect(TEMP dst, TEMP tmp, TEMP scratch);
7642   format %{"vpmovsxbw  $tmp,$src1\n\t"
7643            "vpmovsxbw  $dst,$src2\n\t"
7644            "vpmullw    $tmp,$tmp,$dst\n\t"
7645            "vmovdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7646            "vpand      $dst,$dst,$tmp\n\t"
7647            "vextracti128_high  $tmp,$dst\n\t"
7648            "vpackuswb  $dst,$dst,$tmp\t! mul packed16B" %}
7649   ins_encode %{
7650     int vector_len = 1;
7651     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
7652     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7653     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
7654     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7655     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
7656     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
7657     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
7658   %}
7659   ins_pipe( pipe_slow );
7660 %}
7661 
7662 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, rRegI scratch) %{
7663   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
7664   match(Set dst (MulVB src1 src2));
7665   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7666   format %{"vextracti128_high  $tmp1,$src1\n\t"
7667            "vextracti128_high  $dst,$src2\n\t"
7668            "vpmovsxbw $tmp1,$tmp1\n\t"
7669            "vpmovsxbw $dst,$dst\n\t"
7670            "vpmullw $tmp1,$tmp1,$dst\n\t"
7671            "vpmovsxbw $tmp2,$src1\n\t"
7672            "vpmovsxbw $dst,$src2\n\t"
7673            "vpmullw $tmp2,$tmp2,$dst\n\t"
7674            "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t"
7675            "vpbroadcastd $dst, $dst\n\t"
7676            "vpand $tmp1,$tmp1,$dst\n\t"
7677            "vpand $dst,$dst,$tmp2\n\t"
7678            "vpackuswb $dst,$dst,$tmp1\n\t"
7679            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
7680   ins_encode %{
7681     int vector_len = 1;
7682     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7683     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
7684     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7685     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7686     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7687     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7688     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7689     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7690     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7691     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7692     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7693     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7694     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7695     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
7696   %}
7697   ins_pipe( pipe_slow );
7698 %}
7699 
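// vpackuswb on 256/512-bit registers packs within each 128-bit lane, so the
// 32B form above restores element order with vpermq 0xD8, while the 64B form
// below reorders the packed lanes via the vector_byte_perm_mask table loaded
// with evmovdquq.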
7700 instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
7701   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
7702   match(Set dst (MulVB src1 src2));
7703   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7704   format %{"vextracti64x4_high  $tmp1,$src1\n\t"
7705            "vextracti64x4_high  $dst,$src2\n\t"
7706            "vpmovsxbw $tmp1,$tmp1\n\t"
7707            "vpmovsxbw $dst,$dst\n\t"
7708            "vpmullw $tmp1,$tmp1,$dst\n\t"
7709            "vpmovsxbw $tmp2,$src1\n\t"
7710            "vpmovsxbw $dst,$src2\n\t"
7711            "vpmullw $tmp2,$tmp2,$dst\n\t"
7712            "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t"
7713            "vpbroadcastd $dst, $dst\n\t"
7714            "vpand $tmp1,$tmp1,$dst\n\t"
7715            "vpand $tmp2,$tmp2,$dst\n\t"
7716            "vpackuswb $dst,$tmp1,$tmp2\n\t"
7717            "evmovdquq  $tmp2,[0x0604020007050301]\n\t"
7718            "vpermq $dst,$tmp2,$dst\t! mul packed64B" %}
7720   ins_encode %{
7721     int vector_len = 2;
7722     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7723     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
7724     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7725     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7726     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7727     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7728     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7729     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7730     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7731     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7732     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7733     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7734     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7735     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
7736     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7737 
7738   %}
7739   ins_pipe( pipe_slow );
7740 %}
7741 
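// Note on the MulVB patterns above: x86 has no packed byte multiply, so byte vectors
// are emulated by widening each half to 16-bit lanes (vpmovsxbw), multiplying with
// vpmullw, masking every 16-bit product down to its low byte with the 0x00ff mask,
// and re-packing with vpackuswb (plus a cross-lane vpermq to restore element order).
// The per-lane arithmetic is sketched below; the C++ fragment is an illustrative
// model only (not part of this file), but it shows why keeping just the low 8 bits
// of the widened product yields the correct Java byte result regardless of sign:
//
//   #include <cstdint>
//   // Model of one byte lane: widen, multiply, truncate back to 8 bits.
//   static int8_t mul_byte_lane(int8_t a, int8_t b) {
//     int16_t wide = (int16_t)((int)a * (int)b);  // vpmovsxbw + vpmullw: low 16 bits of product
//     return (int8_t)(wide & 0x00ff);             // pand with 0x00ff + vpackuswb: low byte
//   }
//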
7742 // Shorts/Chars vector mul
7743 instruct vmul2S(vecS dst, vecS src) %{
7744   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7745   match(Set dst (MulVS dst src));
7746   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7747   ins_encode %{
7748     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7749   %}
7750   ins_pipe( pipe_slow );
7751 %}
7752 
7753 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7754   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7755   match(Set dst (MulVS src1 src2));
7756   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7757   ins_encode %{
7758     int vector_len = 0;
7759     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7760   %}
7761   ins_pipe( pipe_slow );
7762 %}
7763 
7764 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7765   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7766   match(Set dst (MulVS src (LoadVector mem)));
7767   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7768   ins_encode %{
7769     int vector_len = 0;
7770     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7771   %}
7772   ins_pipe( pipe_slow );
7773 %}
7774 
7775 instruct vmul4S(vecD dst, vecD src) %{
7776   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7777   match(Set dst (MulVS dst src));
7778   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7779   ins_encode %{
7780     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7781   %}
7782   ins_pipe( pipe_slow );
7783 %}
7784 
7785 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7786   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7787   match(Set dst (MulVS src1 src2));
7788   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7789   ins_encode %{
7790     int vector_len = 0;
7791     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7792   %}
7793   ins_pipe( pipe_slow );
7794 %}
7795 
7796 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7797   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7798   match(Set dst (MulVS src (LoadVector mem)));
7799   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7800   ins_encode %{
7801     int vector_len = 0;
7802     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7803   %}
7804   ins_pipe( pipe_slow );
7805 %}
7806 
7807 instruct vmul8S(vecX dst, vecX src) %{
7808   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7809   match(Set dst (MulVS dst src));
7810   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7811   ins_encode %{
7812     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7813   %}
7814   ins_pipe( pipe_slow );
7815 %}
7816 
7817 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7818   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7819   match(Set dst (MulVS src1 src2));
7820   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7821   ins_encode %{
7822     int vector_len = 0;
7823     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7824   %}
7825   ins_pipe( pipe_slow );
7826 %}
7827 
7828 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7829   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7830   match(Set dst (MulVS src (LoadVector mem)));
7831   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7832   ins_encode %{
7833     int vector_len = 0;
7834     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7835   %}
7836   ins_pipe( pipe_slow );
7837 %}
7838 
7839 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7840   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7841   match(Set dst (MulVS src1 src2));
7842   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7843   ins_encode %{
7844     int vector_len = 1;
7845     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7846   %}
7847   ins_pipe( pipe_slow );
7848 %}
7849 
7850 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7851   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7852   match(Set dst (MulVS src (LoadVector mem)));
7853   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7854   ins_encode %{
7855     int vector_len = 1;
7856     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7857   %}
7858   ins_pipe( pipe_slow );
7859 %}
7860 
7861 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7862   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7863   match(Set dst (MulVS src1 src2));
7864   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7865   ins_encode %{
7866     int vector_len = 2;
7867     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7868   %}
7869   ins_pipe( pipe_slow );
7870 %}
7871 
7872 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7873   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7874   match(Set dst (MulVS src (LoadVector mem)));
7875   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7876   ins_encode %{
7877     int vector_len = 2;
7878     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7879   %}
7880   ins_pipe( pipe_slow );
7881 %}
7882 
7883 // Integers vector mul (sse4_1)
7884 instruct vmul2I(vecD dst, vecD src) %{
7885   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7886   match(Set dst (MulVI dst src));
7887   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7888   ins_encode %{
7889     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7890   %}
7891   ins_pipe( pipe_slow );
7892 %}
7893 
7894 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7895   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7896   match(Set dst (MulVI src1 src2));
7897   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7898   ins_encode %{
7899     int vector_len = 0;
7900     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7901   %}
7902   ins_pipe( pipe_slow );
7903 %}
7904 
7905 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7906   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7907   match(Set dst (MulVI src (LoadVector mem)));
7908   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7909   ins_encode %{
7910     int vector_len = 0;
7911     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7912   %}
7913   ins_pipe( pipe_slow );
7914 %}
7915 
7916 instruct vmul4I(vecX dst, vecX src) %{
7917   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7918   match(Set dst (MulVI dst src));
7919   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7920   ins_encode %{
7921     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7922   %}
7923   ins_pipe( pipe_slow );
7924 %}
7925 
7926 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7927   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7928   match(Set dst (MulVI src1 src2));
7929   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7930   ins_encode %{
7931     int vector_len = 0;
7932     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7933   %}
7934   ins_pipe( pipe_slow );
7935 %}
7936 
7937 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7938   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7939   match(Set dst (MulVI src (LoadVector mem)));
7940   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7941   ins_encode %{
7942     int vector_len = 0;
7943     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7944   %}
7945   ins_pipe( pipe_slow );
7946 %}
7947 
7948 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7949   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7950   match(Set dst (MulVL src1 src2));
7951   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7952   ins_encode %{
7953     int vector_len = 0;
7954     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7955   %}
7956   ins_pipe( pipe_slow );
7957 %}
7958 
7959 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7960   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7961   match(Set dst (MulVL src (LoadVector mem)));
7962   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7963   ins_encode %{
7964     int vector_len = 0;
7965     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7966   %}
7967   ins_pipe( pipe_slow );
7968 %}
7969 
7970 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7971   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7972   match(Set dst (MulVL src1 src2));
7973   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7974   ins_encode %{
7975     int vector_len = 1;
7976     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7977   %}
7978   ins_pipe( pipe_slow );
7979 %}
7980 
7981 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7982   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7983   match(Set dst (MulVL src (LoadVector mem)));
7984   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7985   ins_encode %{
7986     int vector_len = 1;
7987     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7988   %}
7989   ins_pipe( pipe_slow );
7990 %}
7991 
7992 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7993   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7994   match(Set dst (MulVL src1 src2));
7995   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7996   ins_encode %{
7997     int vector_len = 2;
7998     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7999   %}
8000   ins_pipe( pipe_slow );
8001 %}
8002 
8003 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
8004   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8005   match(Set dst (MulVL src (LoadVector mem)));
8006   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
8007   ins_encode %{
8008     int vector_len = 2;
8009     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8010   %}
8011   ins_pipe( pipe_slow );
8012 %}
8013 
8014 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
8015   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8016   match(Set dst (MulVI src1 src2));
8017   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
8018   ins_encode %{
8019     int vector_len = 1;
8020     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8021   %}
8022   ins_pipe( pipe_slow );
8023 %}
8024 
8025 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
8026   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8027   match(Set dst (MulVI src (LoadVector mem)));
8028   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
8029   ins_encode %{
8030     int vector_len = 1;
8031     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8032   %}
8033   ins_pipe( pipe_slow );
8034 %}
8035 
8036 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
8037   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8038   match(Set dst (MulVI src1 src2));
8039   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
8040   ins_encode %{
8041     int vector_len = 2;
8042     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8043   %}
8044   ins_pipe( pipe_slow );
8045 %}
8046 
8047 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
8048   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8049   match(Set dst (MulVI src (LoadVector mem)));
8050   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
8051   ins_encode %{
8052     int vector_len = 2;
8053     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8054   %}
8055   ins_pipe( pipe_slow );
8056 %}
8057 
8058 // Floats vector mul
8059 instruct vmul2F(vecD dst, vecD src) %{
8060   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8061   match(Set dst (MulVF dst src));
8062   format %{ "mulps   $dst,$src\t! mul packed2F" %}
8063   ins_encode %{
8064     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8065   %}
8066   ins_pipe( pipe_slow );
8067 %}
8068 
8069 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
8070   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8071   match(Set dst (MulVF src1 src2));
8072   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
8073   ins_encode %{
8074     int vector_len = 0;
8075     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8076   %}
8077   ins_pipe( pipe_slow );
8078 %}
8079 
8080 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
8081   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8082   match(Set dst (MulVF src (LoadVector mem)));
8083   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
8084   ins_encode %{
8085     int vector_len = 0;
8086     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8087   %}
8088   ins_pipe( pipe_slow );
8089 %}
8090 
8091 instruct vmul4F(vecX dst, vecX src) %{
8092   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8093   match(Set dst (MulVF dst src));
8094   format %{ "mulps   $dst,$src\t! mul packed4F" %}
8095   ins_encode %{
8096     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8097   %}
8098   ins_pipe( pipe_slow );
8099 %}
8100 
8101 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8102   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8103   match(Set dst (MulVF src1 src2));
8104   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8105   ins_encode %{
8106     int vector_len = 0;
8107     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8108   %}
8109   ins_pipe( pipe_slow );
8110 %}
8111 
8112 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8113   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8114   match(Set dst (MulVF src (LoadVector mem)));
8115   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8116   ins_encode %{
8117     int vector_len = 0;
8118     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8119   %}
8120   ins_pipe( pipe_slow );
8121 %}
8122 
8123 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8124   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8125   match(Set dst (MulVF src1 src2));
8126   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8127   ins_encode %{
8128     int vector_len = 1;
8129     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8130   %}
8131   ins_pipe( pipe_slow );
8132 %}
8133 
8134 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8135   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8136   match(Set dst (MulVF src (LoadVector mem)));
8137   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8138   ins_encode %{
8139     int vector_len = 1;
8140     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8141   %}
8142   ins_pipe( pipe_slow );
8143 %}
8144 
8145 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8146   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8147   match(Set dst (MulVF src1 src2));
8148   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8149   ins_encode %{
8150     int vector_len = 2;
8151     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8152   %}
8153   ins_pipe( pipe_slow );
8154 %}
8155 
8156 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8157   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8158   match(Set dst (MulVF src (LoadVector mem)));
8159   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8160   ins_encode %{
8161     int vector_len = 2;
8162     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8163   %}
8164   ins_pipe( pipe_slow );
8165 %}
8166 
8167 // Doubles vector mul
8168 instruct vmul2D(vecX dst, vecX src) %{
8169   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8170   match(Set dst (MulVD dst src));
8171   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8172   ins_encode %{
8173     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8174   %}
8175   ins_pipe( pipe_slow );
8176 %}
8177 
8178 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8179   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8180   match(Set dst (MulVD src1 src2));
8181   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8182   ins_encode %{
8183     int vector_len = 0;
8184     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8185   %}
8186   ins_pipe( pipe_slow );
8187 %}
8188 
8189 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8190   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8191   match(Set dst (MulVD src (LoadVector mem)));
8192   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8193   ins_encode %{
8194     int vector_len = 0;
8195     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8196   %}
8197   ins_pipe( pipe_slow );
8198 %}
8199 
8200 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8201   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8202   match(Set dst (MulVD src1 src2));
8203   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8204   ins_encode %{
8205     int vector_len = 1;
8206     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8207   %}
8208   ins_pipe( pipe_slow );
8209 %}
8210 
8211 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8212   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8213   match(Set dst (MulVD src (LoadVector mem)));
8214   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8215   ins_encode %{
8216     int vector_len = 1;
8217     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8218   %}
8219   ins_pipe( pipe_slow );
8220 %}
8221 
8222 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8223   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8224   match(Set dst (MulVD src1 src2));
8225   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
8226   ins_encode %{
8227     int vector_len = 2;
8228     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8229   %}
8230   ins_pipe( pipe_slow );
8231 %}
8232 
8233 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8234   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8235   match(Set dst (MulVD src (LoadVector mem)));
8236   format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
8237   ins_encode %{
8238     int vector_len = 2;
8239     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8240   %}
8241   ins_pipe( pipe_slow );
8242 %}
8243 
8244 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8245   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8246   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
8247   effect(TEMP dst, USE src1, USE src2);
8248   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
8249             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
8250          %}
8251   ins_encode %{
8252     int vector_len = 1;
8253     int cond = (Assembler::Condition)($copnd$$cmpcode);
8254     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8255     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8256   %}
8257   ins_pipe( pipe_slow );
8258 %}
8259 
8260 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8261   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8262   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
8263   effect(TEMP dst, USE src1, USE src2);
8264   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
8265             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
8266          %}
8267   ins_encode %{
8268     int vector_len = 1;
8269     int cond = (Assembler::Condition)($copnd$$cmpcode);
8270     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8271     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8272   %}
8273   ins_pipe( pipe_slow );
8274 %}
8275 
8276 // --------------------------------- DIV --------------------------------------
8277 
8278 // Floats vector div
8279 instruct vdiv2F(vecD dst, vecD src) %{
8280   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8281   match(Set dst (DivVF dst src));
8282   format %{ "divps   $dst,$src\t! div packed2F" %}
8283   ins_encode %{
8284     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8285   %}
8286   ins_pipe( pipe_slow );
8287 %}
8288 
8289 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
8290   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8291   match(Set dst (DivVF src1 src2));
8292   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8293   ins_encode %{
8294     int vector_len = 0;
8295     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8296   %}
8297   ins_pipe( pipe_slow );
8298 %}
8299 
8300 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8301   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8302   match(Set dst (DivVF src (LoadVector mem)));
8303   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8304   ins_encode %{
8305     int vector_len = 0;
8306     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8307   %}
8308   ins_pipe( pipe_slow );
8309 %}
8310 
8311 instruct vdiv4F(vecX dst, vecX src) %{
8312   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8313   match(Set dst (DivVF dst src));
8314   format %{ "divps   $dst,$src\t! div packed4F" %}
8315   ins_encode %{
8316     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8317   %}
8318   ins_pipe( pipe_slow );
8319 %}
8320 
8321 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8322   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8323   match(Set dst (DivVF src1 src2));
8324   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8325   ins_encode %{
8326     int vector_len = 0;
8327     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8328   %}
8329   ins_pipe( pipe_slow );
8330 %}
8331 
8332 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8333   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8334   match(Set dst (DivVF src (LoadVector mem)));
8335   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8336   ins_encode %{
8337     int vector_len = 0;
8338     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8339   %}
8340   ins_pipe( pipe_slow );
8341 %}
8342 
8343 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8344   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8345   match(Set dst (DivVF src1 src2));
8346   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8347   ins_encode %{
8348     int vector_len = 1;
8349     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8350   %}
8351   ins_pipe( pipe_slow );
8352 %}
8353 
8354 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8355   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8356   match(Set dst (DivVF src (LoadVector mem)));
8357   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8358   ins_encode %{
8359     int vector_len = 1;
8360     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8361   %}
8362   ins_pipe( pipe_slow );
8363 %}
8364 
8365 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8366   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
8367   match(Set dst (DivVF src1 src2));
8368   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8369   ins_encode %{
8370     int vector_len = 2;
8371     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8372   %}
8373   ins_pipe( pipe_slow );
8374 %}
8375 
8376 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8377   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
8378   match(Set dst (DivVF src (LoadVector mem)));
8379   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8380   ins_encode %{
8381     int vector_len = 2;
8382     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8383   %}
8384   ins_pipe( pipe_slow );
8385 %}
8386 
8387 // Doubles vector div
8388 instruct vdiv2D(vecX dst, vecX src) %{
8389   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8390   match(Set dst (DivVD dst src));
8391   format %{ "divpd   $dst,$src\t! div packed2D" %}
8392   ins_encode %{
8393     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8394   %}
8395   ins_pipe( pipe_slow );
8396 %}
8397 
8398 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8399   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8400   match(Set dst (DivVD src1 src2));
8401   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8402   ins_encode %{
8403     int vector_len = 0;
8404     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8405   %}
8406   ins_pipe( pipe_slow );
8407 %}
8408 
8409 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8410   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8411   match(Set dst (DivVD src (LoadVector mem)));
8412   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8413   ins_encode %{
8414     int vector_len = 0;
8415     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8416   %}
8417   ins_pipe( pipe_slow );
8418 %}
8419 
8420 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8421   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8422   match(Set dst (DivVD src1 src2));
8423   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8424   ins_encode %{
8425     int vector_len = 1;
8426     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8427   %}
8428   ins_pipe( pipe_slow );
8429 %}
8430 
8431 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8432   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8433   match(Set dst (DivVD src (LoadVector mem)));
8434   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8435   ins_encode %{
8436     int vector_len = 1;
8437     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8438   %}
8439   ins_pipe( pipe_slow );
8440 %}
8441 
8442 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8443   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8444   match(Set dst (DivVD src1 src2));
8445   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8446   ins_encode %{
8447     int vector_len = 2;
8448     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8449   %}
8450   ins_pipe( pipe_slow );
8451 %}
8452 
8453 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8454   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8455   match(Set dst (DivVD src (LoadVector mem)));
8456   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8457   ins_encode %{
8458     int vector_len = 2;
8459     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8460   %}
8461   ins_pipe( pipe_slow );
8462 %}
8463 
8464 // --------------------------------- Sqrt --------------------------------------
8465 
8466 // Floating point vector sqrt
8467 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8468   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8469   match(Set dst (SqrtVD src));
8470   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8471   ins_encode %{
8472     int vector_len = 0;
8473     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8474   %}
8475   ins_pipe( pipe_slow );
8476 %}
8477 
8478 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8479   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8480   match(Set dst (SqrtVD (LoadVector mem)));
8481   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8482   ins_encode %{
8483     int vector_len = 0;
8484     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8485   %}
8486   ins_pipe( pipe_slow );
8487 %}
8488 
8489 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8490   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8491   match(Set dst (SqrtVD src));
8492   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8493   ins_encode %{
8494     int vector_len = 1;
8495     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8496   %}
8497   ins_pipe( pipe_slow );
8498 %}
8499 
8500 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8501   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8502   match(Set dst (SqrtVD (LoadVector mem)));
8503   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8504   ins_encode %{
8505     int vector_len = 1;
8506     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8507   %}
8508   ins_pipe( pipe_slow );
8509 %}
8510 
8511 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8512   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8513   match(Set dst (SqrtVD src));
8514   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8515   ins_encode %{
8516     int vector_len = 2;
8517     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8518   %}
8519   ins_pipe( pipe_slow );
8520 %}
8521 
8522 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8523   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8524   match(Set dst (SqrtVD (LoadVector mem)));
8525   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8526   ins_encode %{
8527     int vector_len = 2;
8528     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8529   %}
8530   ins_pipe( pipe_slow );
8531 %}
8532 
8533 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8534   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8535   match(Set dst (SqrtVF src));
8536   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8537   ins_encode %{
8538     int vector_len = 0;
8539     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8540   %}
8541   ins_pipe( pipe_slow );
8542 %}
8543 
8544 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8545   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8546   match(Set dst (SqrtVF (LoadVector mem)));
8547   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8548   ins_encode %{
8549     int vector_len = 0;
8550     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8551   %}
8552   ins_pipe( pipe_slow );
8553 %}
8554 
8555 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8556   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8557   match(Set dst (SqrtVF src));
8558   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8559   ins_encode %{
8560     int vector_len = 0;
8561     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8562   %}
8563   ins_pipe( pipe_slow );
8564 %}
8565 
8566 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8567   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8568   match(Set dst (SqrtVF (LoadVector mem)));
8569   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8570   ins_encode %{
8571     int vector_len = 0;
8572     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8573   %}
8574   ins_pipe( pipe_slow );
8575 %}
8576 
8577 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8578   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8579   match(Set dst (SqrtVF src));
8580   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8581   ins_encode %{
8582     int vector_len = 1;
8583     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8584   %}
8585   ins_pipe( pipe_slow );
8586 %}
8587 
8588 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8589   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8590   match(Set dst (SqrtVF (LoadVector mem)));
8591   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8592   ins_encode %{
8593     int vector_len = 1;
8594     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8595   %}
8596   ins_pipe( pipe_slow );
8597 %}
8598 
8599 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8600   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8601   match(Set dst (SqrtVF src));
8602   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8603   ins_encode %{
8604     int vector_len = 2;
8605     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8606   %}
8607   ins_pipe( pipe_slow );
8608 %}
8609 
8610 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8611   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8612   match(Set dst (SqrtVF (LoadVector mem)));
8613   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8614   ins_encode %{
8615     int vector_len = 2;
8616     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8617   %}
8618   ins_pipe( pipe_slow );
8619 %}
8620 
8621 // ------------------------------ Shift ---------------------------------------
8622 
8623 // Left and right shift count vectors are the same on x86
8624 // (only the lowest bits of the xmm register are used for the count).
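// Illustrative note (not generated code): psllw/pslld/psllq and the corresponding
// right-shift forms all take the shift amount from the low 64 bits of the count
// operand, so a single movdl of the Java int count serves LShiftCntV and RShiftCntV
// alike. A minimal C++ model of one 16-bit lane, assuming the usual saturating
// behavior for counts >= the element width:
//
//   #include <cstdint>
//   static uint16_t shl_lane(uint16_t lane, uint64_t cnt) {  // cnt = low 64 bits of count xmm
//     return cnt >= 16 ? 0 : (uint16_t)(lane << cnt);
//   }
//   static uint16_t shr_lane(uint16_t lane, uint64_t cnt) {  // same count register, other direction
//     return cnt >= 16 ? 0 : (uint16_t)(lane >> cnt);
//   }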
8625 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8626   match(Set dst (LShiftCntV cnt));
8627   match(Set dst (RShiftCntV cnt));
8628   format %{ "movdl    $dst,$cnt\t! load shift count" %}
8629   ins_encode %{
8630     __ movdl($dst$$XMMRegister, $cnt$$Register);
8631   %}
8632   ins_pipe( pipe_slow );
8633 %}
8634 
8635 instruct vshiftcntimm(vecS dst, immI8 cnt, rRegI tmp) %{
8636   match(Set dst cnt);
8637   effect(TEMP tmp);
8638   format %{ "movl    $tmp,$cnt\n\t"
8639             "movdl   $dst,$tmp\t! load shift count" %}
8640   ins_encode %{
8641     __ movl($tmp$$Register, $cnt$$constant);
8642     __ movdl($dst$$XMMRegister, $tmp$$Register);
8643   %}
8644   ins_pipe( pipe_slow );
8645 %}
8646 
8647 // Byte vector shift
8648 instruct vshift4B(vecS dst, vecS src, vecS shift, vecS tmp, rRegI scratch) %{
8649   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
8650   match(Set dst (LShiftVB src shift));
8651   match(Set dst (RShiftVB src shift));
8652   match(Set dst (URShiftVB src shift));
8653   effect(TEMP dst, TEMP tmp, TEMP scratch);
8654   format %{"pmovxbw   $tmp,$src\n\t"
8655            "shiftop   $tmp,$shift\n\t"
8656            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
8657            "pand      $dst,$tmp\n\t"
8658            "packuswb  $dst,$dst\n\t ! packed4B shift" %}
8659   ins_encode %{
8660     emit_vshift4Bor8B_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register);
8661   %}
8662   ins_pipe( pipe_slow );
8663 %}
8664 
8665 instruct vshift8B(vecD dst, vecD src, vecS shift, vecD tmp, rRegI scratch) %{
8666   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
8667   match(Set dst (LShiftVB src shift));
8668   match(Set dst (RShiftVB src shift));
8669   match(Set dst (URShiftVB src shift));
8670   effect(TEMP dst, TEMP tmp, TEMP scratch);
8671   format %{"pmovxbw   $tmp,$src\n\t"
8672            "shiftop   $tmp,$shift\n\t"
8673            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
8674            "pand      $dst,$tmp\n\t"
8675            "packuswb  $dst,$dst\n\t ! packed8B shift" %}
8676   ins_encode %{
8677     emit_vshift4Bor8B_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register);
8678   %}
8679   ins_pipe( pipe_slow );
8680 %}
8681 
8682 instruct vshift16B(vecX dst, vecX src, vecS shift, vecX tmp1, vecX tmp2, rRegI scratch) %{
8683   predicate(UseSSE > 3  && UseAVX <= 1 && n->as_Vector()->length() == 16);
8684   match(Set dst (LShiftVB src shift));
8685   match(Set dst (RShiftVB src shift));
8686   match(Set dst (URShiftVB src shift));
8687   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
8688   format %{"pmovxbw   $tmp1,$src\n\t"
8689            "shiftop   $tmp1,$shift\n\t"
8690            "pshufd    $tmp2,$src\n\t"
8691            "pmovxbw   $tmp2,$tmp2\n\t"
8692            "shiftop   $tmp2,$shift\n\t"
8693            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
8694            "pand      $tmp2,$dst\n\t"
8695            "pand      $dst,$tmp1\n\t"
8696            "packuswb  $dst,$tmp2\n\t! packed16B shift" %}
8697   ins_encode %{
8698     emit_vshift16B_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, $scratch$$Register);
8699   %}
8700   ins_pipe( pipe_slow );
8701 %}
8702 
8703 instruct vshift16B_avx(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8704   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8705   match(Set dst (LShiftVB src shift));
8706   match(Set dst (RShiftVB src shift));
8707   match(Set dst (URShiftVB src shift));
8708   effect(TEMP dst, TEMP tmp, TEMP scratch);
8709   format %{"vpmovxbw   $tmp,$src\n\t"
8710            "shiftop    $tmp,$tmp,$shift\n\t"
8711            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
8712            "vextracti128_high  $dst,$tmp\n\t"
8713            "vpackuswb  $dst,$tmp,$dst\n\t! packed16B shift" %}
8714   ins_encode %{
8715     emit_vshift16B_avx_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register);
8716   %}
8717   ins_pipe( pipe_slow );
8718 %}
8719 
8720 instruct vshift32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8721   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
8722   match(Set dst (LShiftVB src shift));
8723   match(Set dst (RShiftVB src shift));
8724   match(Set dst (URShiftVB src shift));
8725   effect(TEMP dst, TEMP tmp, TEMP scratch);
8726   format %{"vextracti128_high  $tmp,$src\n\t"
8727            "vpmovxbw   $tmp,$tmp\n\t"
8728            "vpmovxbw   $dst,$src\n\t"
8729            "shiftop    $tmp,$tmp,$shift\n\t"
8730            "shiftop    $dst,$dst,$shift\n\t"
8731            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
8732            "vpand      $dst,$dst,[0x00ff00ff0x00ff00ff]\n\t"
8733            "vpackuswb  $dst,$dst,$tmp\n\t"
8734            "vpermq     $dst,$dst,0xD8\n\t! packed32B shift" %}
8735   ins_encode %{
8736     emit_vshift32B_avx_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp$$XMMRegister, $scratch$$Register);
8737   %}
8738   ins_pipe( pipe_slow );
8739 %}
8740 
8741 instruct vshift64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
8742   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
8743   match(Set dst (LShiftVB src shift));
8744   match(Set dst (RShiftVB src shift));
8745   match(Set dst (URShiftVB src shift));
8746   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
8747   format %{"vextracti64x4  $tmp1,$src\n\t"
8748            "vpmovxbw       $tmp1,$tmp1\n\t"
8749            "vpmovxbw       $tmp2,$src\n\t"
8750            "shiftop        $tmp1,$tmp1,$shift\n\t"
8751            "shiftop        $tmp2,$tmp2,$shift\n\t"
8752            "vmovdqu        $dst,[0x00ff00ff0x00ff00ff]\n\t"
8753            "vpbroadcastd   $dst,$dst\n\t"
8754            "vpand          $tmp1,$tmp1,$dst\n\t"
8755            "vpand          $tmp2,$tmp2,$dst\n\t"
8756            "vpackuswb      $dst,$tmp1,$tmp2\n\t"
8757            "evmovdquq      $tmp2, [0x0604020007050301]\n\t"
8758            "vpermq         $dst,$tmp2,$dst\n\t! packed64B shift" %}
8759   ins_encode %{
8760     emit_vshift64B_avx_code(_masm, this->as_Mach()->ideal_Opcode() , $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, $scratch$$Register);
8761   %}
8762   ins_pipe( pipe_slow );
8763 %}
8764 
8765 // Shorts vector logical right shift produces an incorrect Java result
8766 // for negative data because Java code converts a short value into an int with
8767 // sign extension before the shift. But char vectors are fine since chars are
8768 // unsigned values.
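// Worked example of the caveat above (an illustrative C++ model, not generated code):
// for a Java short s = -1, "s >>> 2" first sign-extends s to the int 0xffffffff and then
// shifts, giving 0x3fffffff, which narrows back to (short)-1; a 16-bit vector logical
// shift of the same lane would instead produce 0x3fff. Chars are zero-extended, so both
// routes agree for char data.
//
//   #include <cstdint>
//   static int16_t java_short_urshift(int16_t s, int n) {  // scalar Java semantics
//     return (int16_t)((uint32_t)(int32_t)s >> n);         // sign-extend, then logical shift
//   }
//   static int16_t lane16_urshift(int16_t s, int n) {      // what a 16-bit psrlw lane computes
//     return (int16_t)((uint16_t)s >> n);
//   }
//   // java_short_urshift(-1, 2) == -1, but lane16_urshift(-1, 2) == 0x3fff.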
8769 // Shorts/Chars vector shift
8770 instruct vshift2S(vecS dst, vecS src, vecS shift) %{
8771   predicate(n->as_Vector()->length() == 2);
8772   match(Set dst (LShiftVS src shift));
8773   match(Set dst (RShiftVS src shift));
8774   match(Set dst (URShiftVS src shift));
8775   format %{ "shiftop  $dst,$src,$shift\t! shift packed2S" %}
8776   ins_encode %{
8777     if (UseAVX == 0) { 
8778       XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode());
8779       if ($dst$$XMMRegister != $src$$XMMRegister)
8780          __ movflt($dst$$XMMRegister, $src$$XMMRegister);
8781       (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister);
8782     } else {
8783       int vector_len = 0;
8784       XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8785       (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8786     }
8787   %}
8788   ins_pipe( pipe_slow );
8789 %}
8790 
8791 instruct vshift4S(vecD dst, vecD src, vecS shift) %{
8792   predicate(n->as_Vector()->length() == 4);
8793   match(Set dst (LShiftVS src shift));
8794   match(Set dst (RShiftVS src shift));
8795   match(Set dst (URShiftVS src shift));
8796   format %{ "shiftop  $dst,$src,$shift\t! shift packed4S" %}
8797   ins_encode %{
8798     if (UseAVX == 0) { 
8799       XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode());
8800       if ($dst$$XMMRegister != $src$$XMMRegister)
8801          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8802       (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister);
8803     
8804     } else {
8805       int vector_len = 0;
8806       XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8807       (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8808     }
8809   %}
8810   ins_pipe( pipe_slow );
8811 %}
8812 
8813 instruct vshift8S(vecX dst, vecX src, vecS shift) %{
8814   predicate(n->as_Vector()->length() == 8);
8815   match(Set dst (LShiftVS src shift));
8816   match(Set dst (RShiftVS src shift));
8817   match(Set dst (URShiftVS src shift));
8818   format %{ "shiftop  $dst,$src,$shift\t! shift packed8S" %}
8819   ins_encode %{
8820     if (UseAVX == 0) { 
8821       XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode());
8822       if ($dst$$XMMRegister != $src$$XMMRegister)
8823          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8824       (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister);
8825     } else {
8826       int vector_len = 0;
8827       XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8828       (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8829     }
8830   %}
8831   ins_pipe( pipe_slow );
8832 %}
8833 
8834 instruct vshift16S(vecY dst, vecY src, vecS shift) %{
8835   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8836   match(Set dst (LShiftVS src shift));
8837   match(Set dst (RShiftVS src shift));
8838   match(Set dst (URShiftVS src shift));
8839   format %{ "shiftop  $dst,$src,$shift\t! shift packed16S" %}
8840   ins_encode %{
8841     int vector_len = 1;
8842     XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8843     (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8844   %}
8845   ins_pipe( pipe_slow );
8846 %}
8847 
8848 instruct vshift32S(vecZ dst, vecZ src, vecS shift) %{
8849   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8850   match(Set dst (LShiftVS src shift));
8851   match(Set dst (RShiftVS src shift));
8852   match(Set dst (URShiftVS src shift));
8853   format %{ "shiftop  $dst,$src,$shift\t! shift packed32S" %}
8854   ins_encode %{
8855     int vector_len = 2;
8856     XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8857     (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8858   %}
8859   ins_pipe( pipe_slow );
8860 %}
8861 
8862 // Integers vector shift
8863 instruct vshift2I(vecD dst, vecD src, vecS shift) %{
8864   predicate(n->as_Vector()->length() == 2);
8865   match(Set dst (LShiftVI src shift));
8866   match(Set dst (RShiftVI src shift));
8867   match(Set dst (URShiftVI src shift));
8868   format %{ "shiftop  $dst,$src,$shift\t! shift packed2I" %}
8869   ins_encode %{
8870     if (UseAVX == 0) { 
8871       XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode());
8872       if ($dst$$XMMRegister != $src$$XMMRegister)
8873          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8874       (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister);
8875     } else {
8876       int vector_len = 0;
8877       XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8878       (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8879     }
8880   %}
8881   ins_pipe( pipe_slow );
8882 %}
8883 
8884 instruct vshift4I(vecX dst, vecX src, vecS shift) %{
8885   predicate(n->as_Vector()->length() == 4);
8886   match(Set dst (LShiftVI src shift));
8887   match(Set dst (RShiftVI src shift));
8888   match(Set dst (URShiftVI src shift));
8889   format %{ "shiftop  $dst,$src,$shift\t! shift packed4I" %}
8890   ins_encode %{
8891     if (UseAVX == 0) { 
8892       XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode());
8893       if ($dst$$XMMRegister != $src$$XMMRegister)
8894          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8895       (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister);
8896     } else {
8897       int vector_len = 0;
8898       XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8899       (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8900     }
8901   %}
8902   ins_pipe( pipe_slow );
8903 %}
8904 
8905 instruct vshift8I(vecY dst, vecY src, vecS shift) %{
8906   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8907   match(Set dst (LShiftVI src shift));
8908   match(Set dst (RShiftVI src shift));
8909   match(Set dst (URShiftVI src shift));
8910   format %{ "shiftop  $dst,$src,$shift\t! shift packed8I" %}
8911   ins_encode %{
8912     int vector_len = 1;
8913     XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8914     (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8915   %}
8916   ins_pipe( pipe_slow );
8917 %}
8918 
8919 instruct vshift16I(vecZ dst, vecZ src, vecS shift) %{
8920   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8921   match(Set dst (LShiftVI src shift));
8922   match(Set dst (RShiftVI src shift));
8923   match(Set dst (URShiftVI src shift));
8924   format %{ "shiftop  $dst,$src,$shift\t! shift packed16I" %}
8925   ins_encode %{
8926     int vector_len = 2;
8927     XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8928     (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8929   %}
8930   ins_pipe( pipe_slow );
8931 %}
8932 
8933 // Longs vector shift
8934 instruct vshift2L(vecX dst, vecX src, vecS shift) %{
8935   predicate(n->as_Vector()->length() == 2);
8936   match(Set dst (LShiftVL src shift));
8937   match(Set dst (URShiftVL src shift));
8938   format %{ "shiftop  $dst,$src,$shift\t! shift packed2L" %}
8939   ins_encode %{
8940     if (UseAVX == 0) { 
8941       XX_Inst shiftinst = get_xx_inst(this->as_Mach()->ideal_Opcode());
8942       if ($dst$$XMMRegister != $src$$XMMRegister)
8943          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8944       (_masm.*shiftinst)($dst$$XMMRegister, $shift$$XMMRegister);
8945     } else {
8946       int vector_len = 0;
8947       XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8948       (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8949     }
8950   %}
8951   ins_pipe( pipe_slow );
8952 %}
8953 
8954 instruct vshift4L(vecY dst, vecY src, vecS shift) %{
8955   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8956   match(Set dst (LShiftVL src shift));
8957   match(Set dst (URShiftVL src shift));
8958   format %{ "shiftop  $dst,$src,$shift\t! shift packed4L" %}
8959   ins_encode %{
8960     int vector_len = 1;
8961     XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8962     (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8963   %}
8964   ins_pipe( pipe_slow );
8965 %}
8966 
8967 instruct vshift8L(vecZ dst, vecZ src, vecS shift) %{
8968   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8969   match(Set dst (LShiftVL src shift));
8970   match(Set dst (RShiftVL src shift));
8971   match(Set dst (URShiftVL src shift));
8972   format %{ "shiftop  $dst,$src,$shift\t! shift packed8L" %}
8973   ins_encode %{
8974     int vector_len = 2;
8975     XXXI_Inst shiftinst = get_xxxi_inst(this->as_Mach()->ideal_Opcode());
8976     (_masm.*shiftinst)($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8977   %}
8978   ins_pipe( pipe_slow );
8979 %}
8980 
8981 // -------------------ArithmeticRightShift -----------------------------------
8982 // Long vector arithmetic right shift
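// The SSE2 fallback below has no 64-bit arithmetic shift instruction, so it relies on
// the usual sign-extension identity: with m = 0x8000000000000000 >> n (logical shift),
// sra(x, n) == (srl(x, n) ^ m) - m, because the xor/sub pair sign-extends from bit
// 63 - n, where the logical shift leaves the original sign bit. A minimal C++ check of
// the identity (illustrative only, not generated code):
//
//   #include <cstdint>
//   static int64_t sra_via_srl(int64_t x, unsigned n) {  // n in [0, 63]
//     uint64_t m = 0x8000000000000000ULL >> n;           // psrlq of the sign-mask constant
//     uint64_t t = (uint64_t)x >> n;                     // psrlq of the data
//     return (int64_t)((t ^ m) - m);                     // pxor + psubq
//   }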
8983 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8984   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
8985   match(Set dst (RShiftVL src shift));
8986   effect(TEMP dst, TEMP tmp, TEMP scratch);
8987   format %{ "movdqu  $dst,$src\n\t"
8988             "psrlq   $dst,$shift\n\t"
8989             "movdqu  $tmp,[0x8000000000000000]\n\t"
8990             "psrlq   $tmp,$shift\n\t"
8991             "pxor    $dst,$tmp\n\t"
8992             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
8993   ins_encode %{
8994     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8995     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8996     __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8997     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
8998     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
8999     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
9000   %}
9001   ins_pipe( pipe_slow );
9002 %}
9003 
9004 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
9005   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
9006   match(Set dst (RShiftVL src shift));
9007   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
9008   ins_encode %{
9009     int vector_len = 0;
9010     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9011   %}
9012   ins_pipe( pipe_slow );
9013 %}
9014 
9015 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
9016   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9017   match(Set dst (RShiftVL src shift));
9018   effect(TEMP dst, TEMP tmp, TEMP scratch);
9019   format %{ "vpsrlq   $dst,$src,$shift\n\t"
9020             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
9021             "vpsrlq   $tmp,$tmp,$shift\n\t"
9022             "vpxor    $dst,$dst,$tmp\n\t"
9023             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
9024   ins_encode %{
9025     int vector_len = 1;
9026     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9027     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
9028     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
9029     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9030     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
9031   %}
9032   ins_pipe( pipe_slow );
9033 %}
9034 
9035 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
9036   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
9037   match(Set dst (RShiftVL src shift));
9038   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
9039   ins_encode %{
9040     int vector_len = 1;
9041     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9042   %}
9043   ins_pipe( pipe_slow );
9044 %}
9045 
9046 // --------------------------------- AND --------------------------------------
9047 
9048 instruct vand4B(vecS dst, vecS src) %{
9049   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9050   match(Set dst (AndV dst src));
9051   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
9052   ins_encode %{
9053     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9054   %}
9055   ins_pipe( pipe_slow );
9056 %}
9057 
9058 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
9059   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9060   match(Set dst (AndV src1 src2));
9061   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
9062   ins_encode %{
9063     int vector_len = 0;
9064     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9065   %}
9066   ins_pipe( pipe_slow );
9067 %}
9068 
9069 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
9070   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9071   match(Set dst (AndV src (LoadVector mem)));
9072   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
9073   ins_encode %{
9074     int vector_len = 0;
9075     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9076   %}
9077   ins_pipe( pipe_slow );
9078 %}
9079 
9080 instruct vand8B(vecD dst, vecD src) %{
9081   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9082   match(Set dst (AndV dst src));
9083   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
9084   ins_encode %{
9085     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9086   %}
9087   ins_pipe( pipe_slow );
9088 %}
9089 
9090 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
9091   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9092   match(Set dst (AndV src1 src2));
9093   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
9094   ins_encode %{
9095     int vector_len = 0;
9096     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9097   %}
9098   ins_pipe( pipe_slow );
9099 %}
9100 
9101 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
9102   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9103   match(Set dst (AndV src (LoadVector mem)));
9104   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
9105   ins_encode %{
9106     int vector_len = 0;
9107     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9108   %}
9109   ins_pipe( pipe_slow );
9110 %}
9111 
9112 instruct vand16B(vecX dst, vecX src) %{
9113   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9114   match(Set dst (AndV dst src));
9115   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
9116   ins_encode %{
9117     __ pand($dst$$XMMRegister, $src$$XMMRegister);
9118   %}
9119   ins_pipe( pipe_slow );
9120 %}
9121 
9122 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
9123   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9124   match(Set dst (AndV src1 src2));
9125   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
9126   ins_encode %{
9127     int vector_len = 0;
9128     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9129   %}
9130   ins_pipe( pipe_slow );
9131 %}
9132 
9133 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
9134   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9135   match(Set dst (AndV src (LoadVector mem)));
9136   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
9137   ins_encode %{
9138     int vector_len = 0;
9139     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9140   %}
9141   ins_pipe( pipe_slow );
9142 %}
9143 
9144 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
9145   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9146   match(Set dst (AndV src1 src2));
9147   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
9148   ins_encode %{
9149     int vector_len = 1;
9150     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9151   %}
9152   ins_pipe( pipe_slow );
9153 %}
9154 
9155 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
9156   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9157   match(Set dst (AndV src (LoadVector mem)));
9158   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
9159   ins_encode %{
9160     int vector_len = 1;
9161     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9162   %}
9163   ins_pipe( pipe_slow );
9164 %}
9165 
9166 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9167   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9168   match(Set dst (AndV src1 src2));
9169   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
9170   ins_encode %{
9171     int vector_len = 2;
9172     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9173   %}
9174   ins_pipe( pipe_slow );
9175 %}
9176 
9177 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
9178   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9179   match(Set dst (AndV src (LoadVector mem)));
9180   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
9181   ins_encode %{
9182     int vector_len = 2;
9183     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9184   %}
9185   ins_pipe( pipe_slow );
9186 %}
9187 
9188 // --------------------------------- OR ---------------------------------------
9189 
9190 instruct vor4B(vecS dst, vecS src) %{
9191   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9192   match(Set dst (OrV dst src));
9193   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
9194   ins_encode %{
9195     __ por($dst$$XMMRegister, $src$$XMMRegister);
9196   %}
9197   ins_pipe( pipe_slow );
9198 %}
9199 
9200 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
9201   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9202   match(Set dst (OrV src1 src2));
9203   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
9204   ins_encode %{
9205     int vector_len = 0;
9206     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9207   %}
9208   ins_pipe( pipe_slow );
9209 %}
9210 
9211 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
9212   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9213   match(Set dst (OrV src (LoadVector mem)));
9214   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
9215   ins_encode %{
9216     int vector_len = 0;
9217     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9218   %}
9219   ins_pipe( pipe_slow );
9220 %}
9221 
9222 instruct vor8B(vecD dst, vecD src) %{
9223   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9224   match(Set dst (OrV dst src));
9225   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
9226   ins_encode %{
9227     __ por($dst$$XMMRegister, $src$$XMMRegister);
9228   %}
9229   ins_pipe( pipe_slow );
9230 %}
9231 
9232 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9233   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9234   match(Set dst (OrV src1 src2));
9235   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9236   ins_encode %{
9237     int vector_len = 0;
9238     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9239   %}
9240   ins_pipe( pipe_slow );
9241 %}
9242 
9243 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9244   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9245   match(Set dst (OrV src (LoadVector mem)));
9246   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9247   ins_encode %{
9248     int vector_len = 0;
9249     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9250   %}
9251   ins_pipe( pipe_slow );
9252 %}
9253 
9254 instruct vor16B(vecX dst, vecX src) %{
9255   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9256   match(Set dst (OrV dst src));
9257   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9258   ins_encode %{
9259     __ por($dst$$XMMRegister, $src$$XMMRegister);
9260   %}
9261   ins_pipe( pipe_slow );
9262 %}
9263 
9264 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9265   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9266   match(Set dst (OrV src1 src2));
9267   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9268   ins_encode %{
9269     int vector_len = 0;
9270     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9271   %}
9272   ins_pipe( pipe_slow );
9273 %}
9274 
9275 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9276   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9277   match(Set dst (OrV src (LoadVector mem)));
9278   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9279   ins_encode %{
9280     int vector_len = 0;
9281     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9282   %}
9283   ins_pipe( pipe_slow );
9284 %}
9285 
9286 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9287   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9288   match(Set dst (OrV src1 src2));
9289   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9290   ins_encode %{
9291     int vector_len = 1;
9292     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9293   %}
9294   ins_pipe( pipe_slow );
9295 %}
9296 
9297 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9298   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9299   match(Set dst (OrV src (LoadVector mem)));
9300   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9301   ins_encode %{
9302     int vector_len = 1;
9303     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9304   %}
9305   ins_pipe( pipe_slow );
9306 %}
9307 
9308 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9309   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9310   match(Set dst (OrV src1 src2));
9311   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9312   ins_encode %{
9313     int vector_len = 2;
9314     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9315   %}
9316   ins_pipe( pipe_slow );
9317 %}
9318 
9319 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9320   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9321   match(Set dst (OrV src (LoadVector mem)));
9322   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9323   ins_encode %{
9324     int vector_len = 2;
9325     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9326   %}
9327   ins_pipe( pipe_slow );
9328 %}
9329 
9330 // --------------------------------- XOR --------------------------------------
9331 
9332 instruct vxor4B(vecS dst, vecS src) %{
9333   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9334   match(Set dst (XorV dst src));
9335   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9336   ins_encode %{
9337     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9338   %}
9339   ins_pipe( pipe_slow );
9340 %}
9341 
9342 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9343   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9344   match(Set dst (XorV src1 src2));
9345   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9346   ins_encode %{
9347     int vector_len = 0;
9348     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9349   %}
9350   ins_pipe( pipe_slow );
9351 %}
9352 
9353 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9354   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9355   match(Set dst (XorV src (LoadVector mem)));
9356   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9357   ins_encode %{
9358     int vector_len = 0;
9359     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9360   %}
9361   ins_pipe( pipe_slow );
9362 %}
9363 
9364 instruct vxor8B(vecD dst, vecD src) %{
9365   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9366   match(Set dst (XorV dst src));
9367   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9368   ins_encode %{
9369     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9370   %}
9371   ins_pipe( pipe_slow );
9372 %}
9373 
9374 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9375   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9376   match(Set dst (XorV src1 src2));
9377   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9378   ins_encode %{
9379     int vector_len = 0;
9380     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9381   %}
9382   ins_pipe( pipe_slow );
9383 %}
9384 
9385 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9386   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9387   match(Set dst (XorV src (LoadVector mem)));
9388   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9389   ins_encode %{
9390     int vector_len = 0;
9391     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9392   %}
9393   ins_pipe( pipe_slow );
9394 %}
9395 
9396 instruct vxor16B(vecX dst, vecX src) %{
9397   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9398   match(Set dst (XorV dst src));
9399   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9400   ins_encode %{
9401     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9402   %}
9403   ins_pipe( pipe_slow );
9404 %}
9405 
9406 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9407   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9408   match(Set dst (XorV src1 src2));
9409   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9410   ins_encode %{
9411     int vector_len = 0;
9412     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9413   %}
9414   ins_pipe( pipe_slow );
9415 %}
9416 
9417 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9418   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9419   match(Set dst (XorV src (LoadVector mem)));
9420   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9421   ins_encode %{
9422     int vector_len = 0;
9423     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9424   %}
9425   ins_pipe( pipe_slow );
9426 %}
9427 
9428 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9429   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9430   match(Set dst (XorV src1 src2));
9431   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9432   ins_encode %{
9433     int vector_len = 1;
9434     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9435   %}
9436   ins_pipe( pipe_slow );
9437 %}
9438 
9439 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9440   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9441   match(Set dst (XorV src (LoadVector mem)));
9442   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9443   ins_encode %{
9444     int vector_len = 1;
9445     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9446   %}
9447   ins_pipe( pipe_slow );
9448 %}
9449 
9450 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9451   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9452   match(Set dst (XorV src1 src2));
9453   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9454   ins_encode %{
9455     int vector_len = 2;
9456     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9457   %}
9458   ins_pipe( pipe_slow );
9459 %}
9460 
9461 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9462   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9463   match(Set dst (XorV src (LoadVector mem)));
9464   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9465   ins_encode %{
9466     int vector_len = 2;
9467     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9468   %}
9469   ins_pipe( pipe_slow );
9470 %}
9471 
9472 // --------------------------------- ABS --------------------------------------
9473 // a = |a|
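// The byte/short/int forms emit the SSSE3 pabsb/pabsw/pabsd instructions, or
// their AVX vpabs* encodings for the wider operands. Packed-long abs has no
// SSE/AVX2 encoding, so the AbsVL rules require UseAVX > 2 and use the
// AVX-512 evpabsq form.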
9474 instruct vabs4B_reg(vecS dst, vecS src) %{
9475   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9476   match(Set dst (AbsVB  src));
9477   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed4B" %}
9478   ins_encode %{
9479     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9480   %}
9481   ins_pipe( pipe_slow );
9482 %}
9483 
9484 instruct vabs8B_reg(vecD dst, vecD src) %{
9485   predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
9486   match(Set dst (AbsVB  src));
9487   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %}
9488   ins_encode %{
9489     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9490   %}
9491   ins_pipe( pipe_slow );
9492 %}
9493 
9494 instruct vabs16B_reg(vecX dst, vecX src) %{
9495   predicate(UseSSE > 2 && n->as_Vector()->length() == 16);
9496   match(Set dst (AbsVB  src));
9497   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %}
9498   ins_encode %{
9499     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9500   %}
9501   ins_pipe( pipe_slow );
9502 %}
9503 
9504 instruct vabs32B_reg(vecY dst, vecY src) %{
9505   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
9506   match(Set dst (AbsVB  src));
9507   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %}
9508   ins_encode %{
9509     int vector_len = 1;
9510     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9511   %}
9512   ins_pipe( pipe_slow );
9513 %}
9514 
9515 instruct vabs64B_reg(vecZ dst, vecZ src) %{
9516   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
9517   match(Set dst (AbsVB  src));
9518   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed64B" %}
9519   ins_encode %{
9520     int vector_len = 2;
9521     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9522   %}
9523   ins_pipe( pipe_slow );
9524 %}
9525 
9526 instruct vabs2S_reg(vecD dst, vecD src) %{
9527   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
9528   match(Set dst (AbsVS  src));
9529   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed2S" %}
9530   ins_encode %{
9531     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9532   %}
9533   ins_pipe( pipe_slow );
9534 %}
9535 
9536 instruct vabs4S_reg(vecD dst, vecD src) %{
9537   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9538   match(Set dst (AbsVS  src));
9539   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %}
9540   ins_encode %{
9541     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9542   %}
9543   ins_pipe( pipe_slow );
9544 %}
9545 
9546 instruct vabs8S_reg(vecX dst, vecX src) %{
9547   predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
9548   match(Set dst (AbsVS  src));
9549   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %}
9550   ins_encode %{
9551     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9552   %}
9553   ins_pipe( pipe_slow );
9554 %}
9555 
9556 instruct vabs16S_reg(vecY dst, vecY src) %{
9557   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9558   match(Set dst (AbsVS  src));
9559   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %}
9560   ins_encode %{
9561     int vector_len = 1;
9562     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9563   %}
9564   ins_pipe( pipe_slow );
9565 %}
9566 
9567 instruct vabs32S_reg(vecZ dst, vecZ src) %{
9568   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
9569   match(Set dst (AbsVS  src));
9570   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed32S" %}
9571   ins_encode %{
9572     int vector_len = 2;
9573     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9574   %}
9575   ins_pipe( pipe_slow );
9576 %}
9577 
9578 instruct vabs2I_reg(vecD dst, vecD src) %{
9579   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
9580   match(Set dst (AbsVI  src));
9581   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %}
9582   ins_encode %{
9583     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
9584   %}
9585   ins_pipe( pipe_slow );
9586 %}
9587 
9588 instruct vabs4I_reg(vecX dst, vecX src) %{
9589   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9590   match(Set dst (AbsVI  src));
9591   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %}
9592   ins_encode %{
9593     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
9594   %}
9595   ins_pipe( pipe_slow );
9596 %}
9597 
9598 instruct vabs8I_reg(vecY dst, vecY src) %{
9599   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9600   match(Set dst (AbsVI src));
9601   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %}
9602   ins_encode %{
9603     int vector_len = 1;
9604     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9605   %}
9606   ins_pipe( pipe_slow );
9607 %}
9608 
9609 instruct vabs16I_reg(vecZ dst, vecZ src) %{
9610   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9611   match(Set dst (AbsVI src));
9612   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed16I" %}
9613   ins_encode %{
9614     int vector_len = 2;
9615     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 instruct vabs2L_reg(vecX dst, vecX src) %{
9621   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
9622   match(Set dst (AbsVL  src));
9623   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %}
9624   ins_encode %{
9625     int vector_len = 0;
9626     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9627   %}
9628   ins_pipe( pipe_slow );
9629 %}
9630 
9631 instruct vabs4L_reg(vecY dst, vecY src) %{
9632   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
9633   match(Set dst (AbsVL  src));
9634   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %}
9635   ins_encode %{
9636     int vector_len = 1;
9637     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9638   %}
9639   ins_pipe( pipe_slow );
9640 %}
9641 
9642 instruct vabs8L_reg(vecZ dst, vecZ src) %{
9643   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9644   match(Set dst (AbsVL  src));
9645   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %}
9646   ins_encode %{
9647     int vector_len = 2;
9648     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9649   %}
9650   ins_pipe( pipe_slow );
9651 %}
9652 
9653 // --------------------------------- ABSNEG --------------------------------------
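// Vector abs/neg of floats and doubles is done with a bitwise mask: abs
// clears the sign bit of every lane (AND with a sign mask), neg flips it
// (XOR). Each rule below serves both ideal opcodes: get_mask(opcode) returns
// the AddressLiteral of the matching mask constant, and get_xar_inst /
// get_xxair_inst return a MacroAssembler member-function pointer for the
// corresponding and/xor form (SSE or AVX), invoked via (_masm.*opinst)(...).
// The scratch register is a temporary used to materialize the mask address
// when it is not directly reachable. AbsVF/AbsVD/NegVF/NegVD typically come
// from vectorized Math.abs(...) and unary minus over float/double arrays.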
9654 
9655 instruct vabsneg2D(vecX dst, vecX src, rRegI scratch) %{
9656   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
9657   match(Set dst (AbsVD  src));
9658   match(Set dst (NegVD  src));
9659   effect(TEMP scratch);
9660   format %{ "and(xor)pd $dst,$src,[mask]\t# absneg packed2D" %}
9661   ins_encode %{
9662     int opcode = this->as_Mach()->ideal_Opcode();
9663     XAR_Inst opinst = get_xar_inst(opcode);
9664     AddressLiteral adr = get_mask(opcode);
9665     if ($dst$$XMMRegister != $src$$XMMRegister)
9666       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
9667     (_masm.*opinst)($dst$$XMMRegister, adr, $scratch$$Register);
9668   %}
9669   ins_pipe( pipe_slow );
9670 %}
9671 
9672 instruct vabsneg4D(vecY dst, vecY src, rRegI scratch) %{
9673   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9674   match(Set dst (AbsVD  src));
9675   match(Set dst (NegVD  src));
9676   effect(TEMP scratch);
9677   format %{ "vand(xor)pd $dst,$src,[mask]\t# absneg packed4D" %}
9678   ins_encode %{
9679     int opcode = this->as_Mach()->ideal_Opcode();
9680     XXAIR_Inst opinst = get_xxair_inst(opcode);
9681     AddressLiteral adr = get_mask(opcode);
9682     int vector_len = 1;
9683     (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register);
9684   %}
9685   ins_pipe( pipe_slow );
9686 %}
9687 
9688 instruct vabsneg8D(vecZ dst, vecZ src, rRegI scratch) %{
9689   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9690   match(Set dst (AbsVD  src));
9691   match(Set dst (NegVD  src));
9692   effect(TEMP scratch);
9693   format %{ "vand(xor)pd $dst,$src,[mask]\t# absneg packed8D" %}
9694   ins_encode %{
9695     int opcode = this->as_Mach()->ideal_Opcode();
9696     XXAIR_Inst opinst = get_xxair_inst(opcode);
9697     AddressLiteral adr = get_mask(opcode);
9698     int vector_len = 2;
9699     (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register);
9700   %}
9701   ins_pipe( pipe_slow );
9702 %}
9703 
9704 instruct vabsneg2F(vecD dst, vecD src, rRegI scratch) %{
9705   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
9706   match(Set dst (AbsVF  src));
9707   match(Set dst (NegVF  src));
9708   effect(TEMP scratch);
9709   format %{ "and(xor)ps $dst,$src,[mask]\t# absneg packed2F" %}
9710   ins_encode %{
9711     int opcode = this->as_Mach()->ideal_Opcode();
9712     XAR_Inst opinst = get_xar_inst(opcode);
9713     AddressLiteral adr = get_mask(opcode);
9714     if ($dst$$XMMRegister != $src$$XMMRegister)
9715       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
9716     (_masm.*opinst)($dst$$XMMRegister, adr, $scratch$$Register);
9717   %}
9718   ins_pipe( pipe_slow );
9719 %}
9720 
9721 instruct vabsneg4F(vecX dst, rRegI scratch) %{
9722   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
9723   match(Set dst (AbsVF  dst));
9724   match(Set dst (NegVF  dst));
9725   effect(TEMP scratch);
9726   format %{ "vand(xor)ps $dst,[mask]\t# absneg packed4F" %}
9727   ins_cost(150);
9728   ins_encode %{
9729     int opcode = this->as_Mach()->ideal_Opcode();
9730     XAR_Inst opinst = get_xar_inst(opcode);
9731     AddressLiteral adr = get_mask(opcode);
9732     (_masm.*opinst)($dst$$XMMRegister, adr, $scratch$$Register);
9733   %}
9734   ins_pipe( pipe_slow );
9735 %}
9736 
9737 instruct vabsneg8F(vecY dst, vecY src, rRegI scratch) %{
9738   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9739   match(Set dst (AbsVF  src));
9740   match(Set dst (NegVF  src));
9741   effect(TEMP scratch);
9742   format %{ "vand(xor)ps $dst,$src,[mask]\t# absneg packed8F" %}
9743   ins_cost(150);
9744   ins_encode %{
9745     int opcode = this->as_Mach()->ideal_Opcode();
9746     XXAIR_Inst opinst = get_xxair_inst(opcode);
9747     AddressLiteral adr = get_mask(opcode);
9748     int vector_len = 1;
9749     (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register);
9750   %}
9751   ins_pipe( pipe_slow );
9752 %}
9753 
9754 instruct vabsneg16F(vecZ dst, vecZ src, rRegI scratch) %{
9755   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9756   match(Set dst (AbsVF  src));
9757   match(Set dst (NegVF  src));
9758   effect(TEMP scratch);
9759   format %{ "vand(xor)ps $dst,$src,[mask]\t# absneg packed16F" %}
9760   ins_cost(150);
9761   ins_encode %{
9762     int opcode = this->as_Mach()->ideal_Opcode();
9763     XXAIR_Inst opinst = get_xxair_inst(opcode);
9764     AddressLiteral adr = get_mask(opcode);
9765     int vector_len = 2;
9766     (_masm.*opinst)($dst$$XMMRegister, $src$$XMMRegister, adr, vector_len, $scratch$$Register);
9767   %}
9768   ins_pipe( pipe_slow );
9769 %}
9770 
9771 // --------------------------------- FMA --------------------------------------
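// FmaVF/FmaVD compute c = a * b + c with a single rounding. They are only
// generated when UseFMA is enabled and typically originate from Math.fma
// calls in vectorized loops, for example (illustrative Java):
//
//   for (int i = 0; i < n; i++) {
//     c[i] = Math.fma(a[i], b[i], c[i]);
//   }
//
// The vfmad/vfmaf macro-assembler helpers emit the hardware FMA3 fused
// multiply-add encodings.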
9772 
9773 // a * b + c
9774 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9775   predicate(UseFMA && n->as_Vector()->length() == 2);
9776   match(Set c (FmaVD  c (Binary a b)));
9777   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9778   ins_cost(150);
9779   ins_encode %{
9780     int vector_len = 0;
9781     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9782   %}
9783   ins_pipe( pipe_slow );
9784 %}
9785 
9786 // a * b + c
9787 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9788   predicate(UseFMA && n->as_Vector()->length() == 2);
9789   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9790   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9791   ins_cost(150);
9792   ins_encode %{
9793     int vector_len = 0;
9794     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9795   %}
9796   ins_pipe( pipe_slow );
9797 %}
9798 
9799 
9800 // a * b + c
9801 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9802   predicate(UseFMA && n->as_Vector()->length() == 4);
9803   match(Set c (FmaVD  c (Binary a b)));
9804   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9805   ins_cost(150);
9806   ins_encode %{
9807     int vector_len = 1;
9808     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9809   %}
9810   ins_pipe( pipe_slow );
9811 %}
9812 
9813 // a * b + c
9814 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9815   predicate(UseFMA && n->as_Vector()->length() == 4);
9816   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9817   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9818   ins_cost(150);
9819   ins_encode %{
9820     int vector_len = 1;
9821     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9822   %}
9823   ins_pipe( pipe_slow );
9824 %}
9825 
9826 // a * b + c
9827 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9828   predicate(UseFMA && n->as_Vector()->length() == 8);
9829   match(Set c (FmaVD  c (Binary a b)));
9830   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9831   ins_cost(150);
9832   ins_encode %{
9833     int vector_len = 2;
9834     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9835   %}
9836   ins_pipe( pipe_slow );
9837 %}
9838 
9839 // a * b + c
9840 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9841   predicate(UseFMA && n->as_Vector()->length() == 8);
9842   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9843   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9844   ins_cost(150);
9845   ins_encode %{
9846     int vector_len = 2;
9847     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9848   %}
9849   ins_pipe( pipe_slow );
9850 %}
9851 
9852 // a * b + c
9853 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9854   predicate(UseFMA && n->as_Vector()->length() == 4);
9855   match(Set c (FmaVF  c (Binary a b)));
9856   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9857   ins_cost(150);
9858   ins_encode %{
9859     int vector_len = 0;
9860     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9861   %}
9862   ins_pipe( pipe_slow );
9863 %}
9864 
9865 // a * b + c
9866 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9867   predicate(UseFMA && n->as_Vector()->length() == 4);
9868   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9869   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9870   ins_cost(150);
9871   ins_encode %{
9872     int vector_len = 0;
9873     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9874   %}
9875   ins_pipe( pipe_slow );
9876 %}
9877 
9878 // a * b + c
9879 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9880   predicate(UseFMA && n->as_Vector()->length() == 8);
9881   match(Set c (FmaVF  c (Binary a b)));
9882   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9883   ins_cost(150);
9884   ins_encode %{
9885     int vector_len = 1;
9886     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9887   %}
9888   ins_pipe( pipe_slow );
9889 %}
9890 
9891 // a * b + c
9892 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9893   predicate(UseFMA && n->as_Vector()->length() == 8);
9894   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9895   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9896   ins_cost(150);
9897   ins_encode %{
9898     int vector_len = 1;
9899     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9900   %}
9901   ins_pipe( pipe_slow );
9902 %}
9903 
9904 // a * b + c
9905 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9906   predicate(UseFMA && n->as_Vector()->length() == 16);
9907   match(Set c (FmaVF  c (Binary a b)));
9908   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9909   ins_cost(150);
9910   ins_encode %{
9911     int vector_len = 2;
9912     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9913   %}
9914   ins_pipe( pipe_slow );
9915 %}
9916 
9917 // a * b + c
9918 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9919   predicate(UseFMA && n->as_Vector()->length() == 16);
9920   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9921   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9922   ins_cost(150);
9923   ins_encode %{
9924     int vector_len = 2;
9925     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9926   %}
9927   ins_pipe( pipe_slow );
9928 %}
9929 
9930 // --------------------------------- Vector Multiply Add --------------------------------------
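// MulAddVS2VI multiplies adjacent pairs of signed 16-bit elements and adds
// each pair of 32-bit products, yielding half as many packed ints; this is
// exactly the semantics of pmaddwd/vpmaddwd, hence the "packed4Sto2I" style
// naming. A typical source is a short-array dot product (illustrative Java,
// assuming the loop is superword-vectorized):
//
//   int sum = 0;
//   for (int i = 0; i < a.length; i++) {
//     sum += a[i] * b[i];   // a and b are short[]
//   }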
9931 
9932 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9933   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9934   match(Set dst (MulAddVS2VI dst src1));
9935   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9936   ins_encode %{
9937     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9938   %}
9939   ins_pipe( pipe_slow );
9940 %}
9941 
9942 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9943   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9944   match(Set dst (MulAddVS2VI src1 src2));
9945   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9946   ins_encode %{
9947     int vector_len = 0;
9948     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9949   %}
9950   ins_pipe( pipe_slow );
9951 %}
9952 
9953 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
9954   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
9955   match(Set dst (MulAddVS2VI dst src1));
9956   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
9957   ins_encode %{
9958     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9959   %}
9960   ins_pipe( pipe_slow );
9961 %}
9962 
9963 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9964   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9965   match(Set dst (MulAddVS2VI src1 src2));
9966   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
9967   ins_encode %{
9968     int vector_len = 0;
9969     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9970   %}
9971   ins_pipe( pipe_slow );
9972 %}
9973 
9974 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9975   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9976   match(Set dst (MulAddVS2VI src1 src2));
9977   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
9978   ins_encode %{
9979     int vector_len = 1;
9980     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9981   %}
9982   ins_pipe( pipe_slow );
9983 %}
9984 
9985 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9986   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9987   match(Set dst (MulAddVS2VI src1 src2));
9988   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
9989   ins_encode %{
9990     int vector_len = 2;
9991     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9992   %}
9993   ins_pipe( pipe_slow );
9994 %}
9995 
9996 // --------------------------------- Vector Multiply Add Add ----------------------------------
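// With AVX-512 VNNI the multiply-add and the following vector add can be
// fused into a single evpdpwssd (VPDPWSSD): it multiplies adjacent signed
// word pairs, sums the two products and accumulates into the destination
// dwords. That is why these rules match (AddVI (MulAddVS2VI src1 src2) dst),
// are guarded by VM_Version::supports_vnni(), and carry a low ins_cost so
// the matcher prefers the fused form over separate vpmaddwd + vpaddd rules.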
9997 
9998 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9999   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
10000   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10001   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
10002   ins_encode %{
10003     int vector_len = 0;
10004     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10005   %}
10006   ins_pipe( pipe_slow );
10007   ins_cost(10);
10008 %}
10009 
10010 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
10011   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
10012   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10013   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
10014   ins_encode %{
10015     int vector_len = 0;
10016     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10017   %}
10018   ins_pipe( pipe_slow );
10019   ins_cost(10);
10020 %}
10021 
10022 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
10023   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
10024   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10025   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
10026   ins_encode %{
10027     int vector_len = 1;
10028     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10029   %}
10030   ins_pipe( pipe_slow );
10031   ins_cost(10);
10032 %}
10033 
10034 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
10035   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
10036   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
10037   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
10038   ins_encode %{
10039     int vector_len = 2;
10040     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10041   %}
10042   ins_pipe( pipe_slow );
10043   ins_cost(10);
10044 %}
10045 
10046 // --------------------------------- PopCount --------------------------------------
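// PopCountVI counts the set bits in each packed int lane. vpopcntd is only
// available with the AVX-512 VPOPCNTDQ extension, so every rule here is
// guarded by VM_Version::supports_vpopcntdq() in addition to
// UsePopCountInstruction. The node typically comes from a vectorized
// Integer.bitCount loop (illustrative Java):
//
//   for (int i = 0; i < r.length; i++) {
//     r[i] = Integer.bitCount(a[i]);
//   }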
10047 
10048 instruct vpopcount2I(vecD dst, vecD src) %{
10049   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
10050   match(Set dst (PopCountVI src));
10051   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
10052   ins_encode %{
10053     int vector_len = 0;
10054     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10055   %}
10056   ins_pipe( pipe_slow );
10057 %}
10058 
10059 instruct vpopcount4I(vecX dst, vecX src) %{
10060   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
10061   match(Set dst (PopCountVI src));
10062   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
10063   ins_encode %{
10064     int vector_len = 0;
10065     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10066   %}
10067   ins_pipe( pipe_slow );
10068 %}
10069 
10070 instruct vpopcount8I(vecY dst, vecY src) %{
10071   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
10072   match(Set dst (PopCountVI src));
10073   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
10074   ins_encode %{
10075     int vector_len = 1;
10076     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10077   %}
10078   ins_pipe( pipe_slow );
10079 %}
10080 
10081 instruct vpopcount16I(vecZ dst, vecZ src) %{
10082   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
10083   match(Set dst (PopCountVI src));
10084   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
10085   ins_encode %{
10086     int vector_len = 2;
10087     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10088   %}
10089   ins_pipe( pipe_slow );
10090 %}