1 //
   2 // Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // archtecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 733 
 734 // Class for pre evex double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for evex double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre evex 32bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for evex 32bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for all 64bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for all 64bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for all 128bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for all 128bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
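// Singleton register classes, one per XMM register, for operands that must be
// allocated to a specific register (legacy SSE4.1 blendvps/blendvpd, for
// instance, implicitly use XMM0 as the mask).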
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} blocks freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
1285     // three 5 byte instructions
1286     return 15;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
// Emit exception handler code.
// This is simply a jump to the platform-specific exception blob.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
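  // (the call above pushed the address of "next", i.e. "the_pc" plus the
  // length of the call instruction; subtracting the bytes emitted since
  // "the_pc" turns the pushed value back into "the_pc")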
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1376   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1377   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1378 
1379 //=============================================================================
1380 const bool Matcher::match_rule_supported(int opcode) {
1381   if (!has_match_rule(opcode))
1382     return false;
1383 
1384   bool ret_value = true;
1385   switch (opcode) {
    case Op_AbsVL:
      if (UseAVX < 3)
        ret_value = false;
      break;
1389     case Op_PopCountI:
1390     case Op_PopCountL:
1391       if (!UsePopCountInstruction)
1392         ret_value = false;
1393       break;
1394     case Op_PopCountVI:
1395       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1396         ret_value = false;
1397       break;
1398     case Op_MulVI:
1399       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1400         ret_value = false;
1401       break;
1402     case Op_MulVL:
1403     case Op_MulReductionVL:
1404       if (VM_Version::supports_avx512dq() == false)
1405         ret_value = false;
1406       break;
1407     case Op_AddReductionVL:
1408       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1409         ret_value = false;
1410       break;
1411     case Op_AbsVB:
1412     case Op_AbsVS:
1413     case Op_AbsVI:
1414     case Op_AddReductionVI:
1415       if (UseSSE < 3 || !VM_Version::supports_ssse3()) // requires at least SSSE3
1416         ret_value = false;
1417       break;
1418     case Op_MulReductionVI:
1419       if (UseSSE < 4) // requires at least SSE4
1420         ret_value = false;
1421       break;
1422     case Op_AddReductionVF:
1423     case Op_AddReductionVD:
1424     case Op_MulReductionVF:
1425     case Op_MulReductionVD:
1426       if (UseSSE < 1) // requires at least SSE
1427         ret_value = false;
1428       break;
1429     case Op_SqrtVD:
1430     case Op_SqrtVF:
1431       if (UseAVX < 1) // enabled for AVX only
1432         ret_value = false;
1433       break;
    case Op_RShiftVL:
    case Op_AbsVD:
    case Op_NegVD:
1438       if (UseSSE < 2)
1439         ret_value = false;
1440       break;
1441     case Op_MulVB:
1442     case Op_LShiftVB:
1443     case Op_RShiftVB:
1444     case Op_URShiftVB:
1445       if (UseSSE < 4)
1446         ret_value = false;
1447       break;
    case Op_CompareAndSwapL:
#ifdef _LP64
    case Op_CompareAndSwapP:
#endif
1451       if (!VM_Version::supports_cx8())
1452         ret_value = false;
1453       break;
1454     case Op_CMoveVF:
1455     case Op_CMoveVD:
1456       if (UseAVX < 1 || UseAVX > 2)
1457         ret_value = false;
1458       break;
1459     case Op_StrIndexOf:
1460       if (!UseSSE42Intrinsics)
1461         ret_value = false;
1462       break;
1463     case Op_StrIndexOfChar:
1464       if (!UseSSE42Intrinsics)
1465         ret_value = false;
1466       break;
1467     case Op_OnSpinWait:
1468       if (VM_Version::supports_on_spin_wait() == false)
1469         ret_value = false;
1470       break;
1471   }
1472 
  return ret_value;  // By default, match rules are supported.
1474 }
1475 
1476 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, while guarding with vlen.
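  // Note: vlen is the element count, not the byte size, so e.g. vlen == 64 for
  // byte vectors and vlen == 32 for short vectors both describe 512-bit
  // operations, which is why those cases additionally require AVX512BW.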
1479   bool ret_value = match_rule_supported(opcode);
1480   if (ret_value) {
1481     switch (opcode) {
1482       case Op_AbsVB:
1483       case Op_AddVB:
1484       case Op_SubVB:
1485         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1486           ret_value = false;
1487         break;
1488       case Op_AbsVS:
1489       case Op_AddVS:
1490       case Op_SubVS:
1491       case Op_MulVS:
1492       case Op_LShiftVS:
1493       case Op_RShiftVS:
1494       case Op_URShiftVS:
1495         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1496           ret_value = false;
1497         break;
1498       case Op_MulVB:
1499       case Op_LShiftVB:
1500       case Op_RShiftVB:
1501       case Op_URShiftVB:
1502         if ((vlen == 32 && UseAVX < 2) || 
1503             ((vlen == 64) && (VM_Version::supports_avx512bw() == false)))
1504           ret_value = false;
1505         break;
1506       case Op_NegVF:
1507         if ((vlen == 16) && (VM_Version::supports_avx512dq() == false))
1508           ret_value = false;
1509         break;
1510       case Op_CMoveVF:
1511         if (vlen != 8)
1512           ret_value  = false;
1513         break;
1514       case Op_NegVD:
1515         if ((vlen == 8) && (VM_Version::supports_avx512dq() == false))
1516           ret_value = false;
1517         break;
1518       case Op_CMoveVD:
1519         if (vlen != 4)
1520           ret_value  = false;
1521         break;
1522     }
1523   }
1524 
  return ret_value;  // By default, match rules are supported.
1526 }
1527 
1528 const bool Matcher::has_predicated_vectors(void) {
1529   bool ret_value = false;
1530   if (UseAVX > 2) {
1531     ret_value = VM_Version::supports_avx512vl();
1532   }
1533 
1534   return ret_value;
1535 }
1536 
1537 const int Matcher::float_pressure(int default_pressure_threshold) {
1538   int float_pressure_threshold = default_pressure_threshold;
1539 #ifdef _LP64
1540   if (UseAVX > 2) {
1541     // Increase pressure threshold on machines with AVX3 which have
1542     // 2x more XMM registers.
1543     float_pressure_threshold = default_pressure_threshold * 2;
1544   }
1545 #endif
1546   return float_pressure_threshold;
1547 }
1548 
1549 // Max vector size in bytes. 0 if not supported.
1550 const int Matcher::vector_width_in_bytes(BasicType bt) {
1551   assert(is_java_primitive(bt), "only primitive type vectors");
1552   if (UseSSE < 2) return 0;
1553   // SSE2 supports 128bit vectors for all types.
1554   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
1556   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
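  // For example, UseAVX == 2 yields (1 << 2) * 8 = 32 bytes (256-bit vectors)
  // and UseAVX == 3 yields 64 bytes (512-bit), before the per-type and
  // MaxVectorSize adjustments below are applied.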
1557   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1558   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1559     size = (UseAVX > 2) ? 64 : 32;
1560   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1561     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1562   // Use flag to limit vector size.
1563   size = MIN2(size,(int)MaxVectorSize);
1564   // Minimum 2 values in vector (or 4 for bytes).
1565   switch (bt) {
1566   case T_DOUBLE:
1567   case T_LONG:
1568     if (size < 16) return 0;
1569     break;
1570   case T_FLOAT:
1571   case T_INT:
1572     if (size < 8) return 0;
1573     break;
1574   case T_BOOLEAN:
1575     if (size < 4) return 0;
1576     break;
1577   case T_CHAR:
1578     if (size < 4) return 0;
1579     break;
1580   case T_BYTE:
1581     if (size < 4) return 0;
1582     break;
1583   case T_SHORT:
1584     if (size < 4) return 0;
1585     break;
1586   default:
1587     ShouldNotReachHere();
1588   }
1589   return size;
1590 }
1591 
1592 // Limits on vector size (number of elements) loaded into vector.
1593 const int Matcher::max_vector_size(const BasicType bt) {
1594   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1595 }
1596 const int Matcher::min_vector_size(const BasicType bt) {
1597   int max_size = max_vector_size(bt);
1598   // Min size which can be loaded into vector is 4 bytes.
1599   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1600   return MIN2(size,max_size);
1601 }
1602 
1603 // Vector ideal reg corresponding to specified size in bytes
1604 const uint Matcher::vector_ideal_reg(int size) {
1605   assert(MaxVectorSize >= size, "");
1606   switch(size) {
1607     case  4: return Op_VecS;
1608     case  8: return Op_VecD;
1609     case 16: return Op_VecX;
1610     case 32: return Op_VecY;
1611     case 64: return Op_VecZ;
1612   }
1613   ShouldNotReachHere();
1614   return 0;
1615 }
1616 
1617 // Only lowest bits of xmm reg are used for vector shift count.
1618 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1619   return Op_VecS;
1620 }
1621 
1622 // x86 supports misaligned vectors store/load.
1623 const bool Matcher::misaligned_vectors_ok() {
1624   return true;
1625 }
1626 
1627 // x86 AES instructions are compatible with SunJCE expanded
1628 // keys, hence we do not need to pass the original key to stubs
1629 const bool Matcher::pass_original_key_for_aes() {
1630   return false;
1631 }
1632 
1633 
1634 const bool Matcher::convi2l_type_required = true;
1635 
1636 // Check for shift by small constant as well
1637 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1638   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1639       shift->in(2)->get_int() <= 3 &&
1640       // Are there other uses besides address expressions?
1641       !matcher->is_visited(shift)) {
1642     address_visited.set(shift->_idx); // Flag as address_visited
1643     mstack.push(shift->in(2), Matcher::Visit);
1644     Node *conv = shift->in(1);
1645 #ifdef _LP64
    // Allow the Matcher to match the rule which bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1649     if (conv->Opcode() == Op_ConvI2L &&
1650         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1651         // Are there other uses besides address expressions?
1652         !matcher->is_visited(conv)) {
1653       address_visited.set(conv->_idx); // Flag as address_visited
1654       mstack.push(conv->in(1), Matcher::Pre_Visit);
1655     } else
1656 #endif
1657       mstack.push(conv, Matcher::Pre_Visit);
1658     return true;
1659   }
1660   return false;
1661 }
1662 
1663 // Should the Matcher clone shifts on addressing modes, expecting them
1664 // to be subsumed into complex addressing expressions or compute them
1665 // into registers?
1666 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1667   Node *off = m->in(AddPNode::Offset);
1668   if (off->is_Con()) {
1669     address_visited.test_set(m->_idx); // Flag as address_visited
1670     Node *adr = m->in(AddPNode::Address);
1671 
1672     // Intel can handle 2 adds in addressing mode
1673     // AtomicAdd is not an addressing expression.
1674     // Cheap to find it by looking for screwy base.
1675     if (adr->is_AddP() &&
1676         !adr->in(AddPNode::Base)->is_top() &&
1677         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1678         // Are there other uses besides address expressions?
1679         !is_visited(adr)) {
1680       address_visited.set(adr->_idx); // Flag as address_visited
1681       Node *shift = adr->in(AddPNode::Offset);
1682       if (!clone_shift(shift, this, mstack, address_visited)) {
1683         mstack.push(shift, Pre_Visit);
1684       }
1685       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1686       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1687     } else {
1688       mstack.push(adr, Pre_Visit);
1689     }
1690 
1691     // Clone X+offset as it also folds into most addressing expressions
1692     mstack.push(off, Visit);
1693     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1694     return true;
1695   } else if (clone_shift(off, this, mstack, address_visited)) {
1696     address_visited.test_set(m->_idx); // Flag as address_visited
1697     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1698     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1699     return true;
1700   }
1701   return false;
1702 }
1703 
1704 void Compile::reshape_address(AddPNode* addp) {
1705 }
1706 
1707 // Helper methods for MachSpillCopyNode::implementation().
1708 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1709                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer instead.
1712   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1713   assert(ireg == Op_VecS || // 32bit vector
1714          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1715          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1716          "no non-adjacent vector moves" );
1717   if (cbuf) {
1718     MacroAssembler _masm(cbuf);
1719     int offset = __ offset();
1720     switch (ireg) {
1721     case Op_VecS: // copy whole register
1722     case Op_VecD:
1723     case Op_VecX:
1724 #ifndef _LP64
1725       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1726 #else
1727       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1728         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1729       } else {
1730         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1731      }
1732 #endif
1733       break;
1734     case Op_VecY:
1735 #ifndef _LP64
1736       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1737 #else
1738       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1739         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1740       } else {
1741         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1742      }
1743 #endif
1744       break;
1745     case Op_VecZ:
1746       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1747       break;
1748     default:
1749       ShouldNotReachHere();
1750     }
1751     int size = __ offset() - offset;
1752 #ifdef ASSERT
1753     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1755 #endif
1756     return size;
1757 #ifndef PRODUCT
1758   } else if (!do_size) {
1759     switch (ireg) {
1760     case Op_VecS:
1761     case Op_VecD:
1762     case Op_VecX:
1763       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1764       break;
1765     case Op_VecY:
1766     case Op_VecZ:
1767       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1768       break;
1769     default:
1770       ShouldNotReachHere();
1771     }
1772 #endif
1773   }
1774   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1775   return (UseAVX > 2) ? 6 : 4;
1776 }
1777 
1778 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1779                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer instead.
1782   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1783   if (cbuf) {
1784     MacroAssembler _masm(cbuf);
1785     int offset = __ offset();
1786     if (is_load) {
1787       switch (ireg) {
1788       case Op_VecS:
1789         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1790         break;
1791       case Op_VecD:
1792         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1793         break;
1794       case Op_VecX:
1795 #ifndef _LP64
1796         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1797 #else
1798         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1799           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1800         } else {
1801           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1802           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1803         }
1804 #endif
1805         break;
1806       case Op_VecY:
1807 #ifndef _LP64
1808         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1809 #else
1810         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1811           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1812         } else {
1813           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1814           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1815         }
1816 #endif
1817         break;
1818       case Op_VecZ:
1819         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1820         break;
1821       default:
1822         ShouldNotReachHere();
1823       }
1824     } else { // store
1825       switch (ireg) {
1826       case Op_VecS:
1827         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1828         break;
1829       case Op_VecD:
1830         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1831         break;
1832       case Op_VecX:
1833 #ifndef _LP64
1834         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1835 #else
1836         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1837           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1838         }
1839         else {
1840           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1841         }
1842 #endif
1843         break;
1844       case Op_VecY:
1845 #ifndef _LP64
1846         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1847 #else
1848         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1849           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1850         }
1851         else {
1852           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1853         }
1854 #endif
1855         break;
1856       case Op_VecZ:
1857         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1858         break;
1859       default:
1860         ShouldNotReachHere();
1861       }
1862     }
1863     int size = __ offset() - offset;
1864 #ifdef ASSERT
1865     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1866     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1868 #endif
1869     return size;
1870 #ifndef PRODUCT
1871   } else if (!do_size) {
1872     if (is_load) {
1873       switch (ireg) {
1874       case Op_VecS:
1875         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1876         break;
1877       case Op_VecD:
1878         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1879         break;
1880        case Op_VecX:
1881         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1882         break;
1883       case Op_VecY:
1884       case Op_VecZ:
1885         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1886         break;
1887       default:
1888         ShouldNotReachHere();
1889       }
1890     } else { // store
1891       switch (ireg) {
1892       case Op_VecS:
1893         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1894         break;
1895       case Op_VecD:
1896         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1897         break;
1898        case Op_VecX:
1899         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1900         break;
1901       case Op_VecY:
1902       case Op_VecZ:
1903         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1904         break;
1905       default:
1906         ShouldNotReachHere();
1907       }
1908     }
1909 #endif
1910   }
1911   bool is_single_byte = false;
1912   int vec_len = 0;
1913   if ((UseAVX > 2) && (stack_offset != 0)) {
1914     int tuple_type = Assembler::EVEX_FVM;
1915     int input_size = Assembler::EVEX_32bit;
1916     switch (ireg) {
1917     case Op_VecS:
1918       tuple_type = Assembler::EVEX_T1S;
1919       break;
1920     case Op_VecD:
1921       tuple_type = Assembler::EVEX_T1S;
1922       input_size = Assembler::EVEX_64bit;
1923       break;
1924     case Op_VecX:
1925       break;
1926     case Op_VecY:
1927       vec_len = 1;
1928       break;
1929     case Op_VecZ:
1930       vec_len = 2;
1931       break;
1932     }
1933     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1934   }
1935   int offset_size = 0;
1936   int size = 5;
1937   if (UseAVX > 2 ) {
1938     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1939       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1940       size += 2; // Need an additional two bytes for EVEX encoding
1941     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1942       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1943     } else {
1944       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1946     }
1947   } else {
1948     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1949   }
1950   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1951   return size+offset_size;
1952 }
1953 
1954 static inline jint replicate4_imm(int con, int width) {
1955   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1956   assert(width == 1 || width == 2, "only byte or short types here");
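  // For example, replicate4_imm(0x0A, 1) yields 0x0A0A0A0A and
  // replicate4_imm(0x1234, 2) yields 0x12341234.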
1957   int bit_width = width * 8;
1958   jint val = con;
1959   val &= (1 << bit_width) - 1;  // mask off sign bits
1960   while(bit_width < 32) {
1961     val |= (val << bit_width);
1962     bit_width <<= 1;
1963   }
1964   return val;
1965 }
1966 
1967 static inline jlong replicate8_imm(int con, int width) {
1968   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1969   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
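  // For example, replicate8_imm(0x7F, 1) yields 0x7F7F7F7F7F7F7F7F.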
1970   int bit_width = width * 8;
1971   jlong val = con;
1972   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1973   while(bit_width < 64) {
1974     val |= (val << bit_width);
1975     bit_width <<= 1;
1976   }
1977   return val;
1978 }
1979 
1980 #ifndef PRODUCT
1981   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1982     st->print("nop \t# %d bytes pad for loops and calls", _count);
1983   }
1984 #endif
1985 
1986   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1987     MacroAssembler _masm(&cbuf);
1988     __ nop(_count);
1989   }
1990 
1991   uint MachNopNode::size(PhaseRegAlloc*) const {
1992     return _count;
1993   }
1994 
1995 #ifndef PRODUCT
1996   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1997     st->print("# breakpoint");
1998   }
1999 #endif
2000 
2001   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2002     MacroAssembler _masm(&cbuf);
2003     __ int3();
2004   }
2005 
2006   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2007     return MachNode::size(ra_);
2008   }
2009 
2010 %}
2011 
2012 encode %{
2013 
2014   enc_class call_epilog %{
2015     if (VerifyStackAtCalls) {
2016       // Check that stack depth is unchanged: find majik cookie on stack
2017       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2018       MacroAssembler _masm(&cbuf);
2019       Label L;
2020       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2021       __ jccb(Assembler::equal, L);
2022       // Die if stack mismatch
2023       __ int3();
2024       __ bind(L);
2025     }
2026   %}
2027 
2028 %}
2029 
2030 
2031 //----------OPERANDS-----------------------------------------------------------
2032 // Operand definitions must precede instruction definitions for correct parsing
2033 // in the ADLC because operands constitute user defined types which are used in
2034 // instruction definitions.
2035 
2036 operand vecZ() %{
2037   constraint(ALLOC_IN_RC(vectorz_reg));
2038   match(VecZ);
2039 
2040   format %{ %}
2041   interface(REG_INTER);
2042 %}
2043 
2044 operand legVecZ() %{
2045   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2046   match(VecZ);
2047 
2048   format %{ %}
2049   interface(REG_INTER);
2050 %}
2051 
2052 // Comparison Code for FP conditional move
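// The hex values below follow the vcmppd/vcmpps imm8 predicate encoding
// (0x0 EQ_OQ, 0x1 LT_OS, 0x2 LE_OS, 0xC NEQ_OQ, 0xD GE_OS, 0xE GT_OS).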
2053 operand cmpOp_vcmppd() %{
2054   match(Bool);
2055 
2056   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2057             n->as_Bool()->_test._test != BoolTest::no_overflow);
2058   format %{ "" %}
2059   interface(COND_INTER) %{
2060     equal        (0x0, "eq");
2061     less         (0x1, "lt");
2062     less_equal   (0x2, "le");
2063     not_equal    (0xC, "ne");
2064     greater_equal(0xD, "ge");
2065     greater      (0xE, "gt");
    // TODO: adlc cannot compile this operand without the next two lines; it fails with:
2067     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2068     // equal' for overflow.
2069     overflow     (0x20, "o");  // not really supported by the instruction
2070     no_overflow  (0x21, "no"); // not really supported by the instruction
2071   %}
2072 %}
2073 
2074 
2075 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2076 
2077 // ============================================================================
2078 
2079 instruct ShouldNotReachHere() %{
2080   match(Halt);
2081   format %{ "stop\t# ShouldNotReachHere" %}
2082   ins_encode %{
2083     if (is_reachable()) {
2084       __ stop(_halt_reason);
2085     }
2086   %}
2087   ins_pipe(pipe_slow);
2088 %}
2089 
2090 // =================================EVEX special===============================
2091 
2092 instruct setMask(rRegI dst, rRegI src) %{
2093   predicate(Matcher::has_predicated_vectors());
2094   match(Set dst (SetVectMaskI  src));
2095   effect(TEMP dst);
2096   format %{ "setvectmask   $dst, $src" %}
2097   ins_encode %{
2098     __ setvectmask($dst$$Register, $src$$Register);
2099   %}
2100   ins_pipe(pipe_slow);
2101 %}
2102 
2103 // ============================================================================
2104 
2105 instruct addF_reg(regF dst, regF src) %{
2106   predicate((UseSSE>=1) && (UseAVX == 0));
2107   match(Set dst (AddF dst src));
2108 
2109   format %{ "addss   $dst, $src" %}
2110   ins_cost(150);
2111   ins_encode %{
2112     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2113   %}
2114   ins_pipe(pipe_slow);
2115 %}
2116 
2117 instruct addF_mem(regF dst, memory src) %{
2118   predicate((UseSSE>=1) && (UseAVX == 0));
2119   match(Set dst (AddF dst (LoadF src)));
2120 
2121   format %{ "addss   $dst, $src" %}
2122   ins_cost(150);
2123   ins_encode %{
2124     __ addss($dst$$XMMRegister, $src$$Address);
2125   %}
2126   ins_pipe(pipe_slow);
2127 %}
2128 
2129 instruct addF_imm(regF dst, immF con) %{
2130   predicate((UseSSE>=1) && (UseAVX == 0));
2131   match(Set dst (AddF dst con));
2132   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2133   ins_cost(150);
2134   ins_encode %{
2135     __ addss($dst$$XMMRegister, $constantaddress($con));
2136   %}
2137   ins_pipe(pipe_slow);
2138 %}
2139 
2140 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2141   predicate(UseAVX > 0);
2142   match(Set dst (AddF src1 src2));
2143 
2144   format %{ "vaddss  $dst, $src1, $src2" %}
2145   ins_cost(150);
2146   ins_encode %{
2147     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2148   %}
2149   ins_pipe(pipe_slow);
2150 %}
2151 
2152 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2153   predicate(UseAVX > 0);
2154   match(Set dst (AddF src1 (LoadF src2)));
2155 
2156   format %{ "vaddss  $dst, $src1, $src2" %}
2157   ins_cost(150);
2158   ins_encode %{
2159     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2160   %}
2161   ins_pipe(pipe_slow);
2162 %}
2163 
2164 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2165   predicate(UseAVX > 0);
2166   match(Set dst (AddF src con));
2167 
2168   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2169   ins_cost(150);
2170   ins_encode %{
2171     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2172   %}
2173   ins_pipe(pipe_slow);
2174 %}
2175 
2176 instruct addD_reg(regD dst, regD src) %{
2177   predicate((UseSSE>=2) && (UseAVX == 0));
2178   match(Set dst (AddD dst src));
2179 
2180   format %{ "addsd   $dst, $src" %}
2181   ins_cost(150);
2182   ins_encode %{
2183     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2184   %}
2185   ins_pipe(pipe_slow);
2186 %}
2187 
2188 instruct addD_mem(regD dst, memory src) %{
2189   predicate((UseSSE>=2) && (UseAVX == 0));
2190   match(Set dst (AddD dst (LoadD src)));
2191 
2192   format %{ "addsd   $dst, $src" %}
2193   ins_cost(150);
2194   ins_encode %{
2195     __ addsd($dst$$XMMRegister, $src$$Address);
2196   %}
2197   ins_pipe(pipe_slow);
2198 %}
2199 
2200 instruct addD_imm(regD dst, immD con) %{
2201   predicate((UseSSE>=2) && (UseAVX == 0));
2202   match(Set dst (AddD dst con));
2203   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2204   ins_cost(150);
2205   ins_encode %{
2206     __ addsd($dst$$XMMRegister, $constantaddress($con));
2207   %}
2208   ins_pipe(pipe_slow);
2209 %}
2210 
2211 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2212   predicate(UseAVX > 0);
2213   match(Set dst (AddD src1 src2));
2214 
2215   format %{ "vaddsd  $dst, $src1, $src2" %}
2216   ins_cost(150);
2217   ins_encode %{
2218     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2219   %}
2220   ins_pipe(pipe_slow);
2221 %}
2222 
2223 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2224   predicate(UseAVX > 0);
2225   match(Set dst (AddD src1 (LoadD src2)));
2226 
2227   format %{ "vaddsd  $dst, $src1, $src2" %}
2228   ins_cost(150);
2229   ins_encode %{
2230     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2231   %}
2232   ins_pipe(pipe_slow);
2233 %}
2234 
2235 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2236   predicate(UseAVX > 0);
2237   match(Set dst (AddD src con));
2238 
2239   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2240   ins_cost(150);
2241   ins_encode %{
2242     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2243   %}
2244   ins_pipe(pipe_slow);
2245 %}
2246 
2247 instruct subF_reg(regF dst, regF src) %{
2248   predicate((UseSSE>=1) && (UseAVX == 0));
2249   match(Set dst (SubF dst src));
2250 
2251   format %{ "subss   $dst, $src" %}
2252   ins_cost(150);
2253   ins_encode %{
2254     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2255   %}
2256   ins_pipe(pipe_slow);
2257 %}
2258 
2259 instruct subF_mem(regF dst, memory src) %{
2260   predicate((UseSSE>=1) && (UseAVX == 0));
2261   match(Set dst (SubF dst (LoadF src)));
2262 
2263   format %{ "subss   $dst, $src" %}
2264   ins_cost(150);
2265   ins_encode %{
2266     __ subss($dst$$XMMRegister, $src$$Address);
2267   %}
2268   ins_pipe(pipe_slow);
2269 %}
2270 
2271 instruct subF_imm(regF dst, immF con) %{
2272   predicate((UseSSE>=1) && (UseAVX == 0));
2273   match(Set dst (SubF dst con));
2274   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2275   ins_cost(150);
2276   ins_encode %{
2277     __ subss($dst$$XMMRegister, $constantaddress($con));
2278   %}
2279   ins_pipe(pipe_slow);
2280 %}
2281 
2282 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2283   predicate(UseAVX > 0);
2284   match(Set dst (SubF src1 src2));
2285 
2286   format %{ "vsubss  $dst, $src1, $src2" %}
2287   ins_cost(150);
2288   ins_encode %{
2289     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2290   %}
2291   ins_pipe(pipe_slow);
2292 %}
2293 
2294 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2295   predicate(UseAVX > 0);
2296   match(Set dst (SubF src1 (LoadF src2)));
2297 
2298   format %{ "vsubss  $dst, $src1, $src2" %}
2299   ins_cost(150);
2300   ins_encode %{
2301     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2302   %}
2303   ins_pipe(pipe_slow);
2304 %}
2305 
2306 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2307   predicate(UseAVX > 0);
2308   match(Set dst (SubF src con));
2309 
2310   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2311   ins_cost(150);
2312   ins_encode %{
2313     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2314   %}
2315   ins_pipe(pipe_slow);
2316 %}
2317 
2318 instruct subD_reg(regD dst, regD src) %{
2319   predicate((UseSSE>=2) && (UseAVX == 0));
2320   match(Set dst (SubD dst src));
2321 
2322   format %{ "subsd   $dst, $src" %}
2323   ins_cost(150);
2324   ins_encode %{
2325     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2326   %}
2327   ins_pipe(pipe_slow);
2328 %}
2329 
2330 instruct subD_mem(regD dst, memory src) %{
2331   predicate((UseSSE>=2) && (UseAVX == 0));
2332   match(Set dst (SubD dst (LoadD src)));
2333 
2334   format %{ "subsd   $dst, $src" %}
2335   ins_cost(150);
2336   ins_encode %{
2337     __ subsd($dst$$XMMRegister, $src$$Address);
2338   %}
2339   ins_pipe(pipe_slow);
2340 %}
2341 
2342 instruct subD_imm(regD dst, immD con) %{
2343   predicate((UseSSE>=2) && (UseAVX == 0));
2344   match(Set dst (SubD dst con));
2345   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2346   ins_cost(150);
2347   ins_encode %{
2348     __ subsd($dst$$XMMRegister, $constantaddress($con));
2349   %}
2350   ins_pipe(pipe_slow);
2351 %}
2352 
2353 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2354   predicate(UseAVX > 0);
2355   match(Set dst (SubD src1 src2));
2356 
2357   format %{ "vsubsd  $dst, $src1, $src2" %}
2358   ins_cost(150);
2359   ins_encode %{
2360     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2361   %}
2362   ins_pipe(pipe_slow);
2363 %}
2364 
2365 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2366   predicate(UseAVX > 0);
2367   match(Set dst (SubD src1 (LoadD src2)));
2368 
2369   format %{ "vsubsd  $dst, $src1, $src2" %}
2370   ins_cost(150);
2371   ins_encode %{
2372     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2373   %}
2374   ins_pipe(pipe_slow);
2375 %}
2376 
2377 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2378   predicate(UseAVX > 0);
2379   match(Set dst (SubD src con));
2380 
2381   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2382   ins_cost(150);
2383   ins_encode %{
2384     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2385   %}
2386   ins_pipe(pipe_slow);
2387 %}
2388 
2389 instruct mulF_reg(regF dst, regF src) %{
2390   predicate((UseSSE>=1) && (UseAVX == 0));
2391   match(Set dst (MulF dst src));
2392 
2393   format %{ "mulss   $dst, $src" %}
2394   ins_cost(150);
2395   ins_encode %{
2396     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2397   %}
2398   ins_pipe(pipe_slow);
2399 %}
2400 
2401 instruct mulF_mem(regF dst, memory src) %{
2402   predicate((UseSSE>=1) && (UseAVX == 0));
2403   match(Set dst (MulF dst (LoadF src)));
2404 
2405   format %{ "mulss   $dst, $src" %}
2406   ins_cost(150);
2407   ins_encode %{
2408     __ mulss($dst$$XMMRegister, $src$$Address);
2409   %}
2410   ins_pipe(pipe_slow);
2411 %}
2412 
2413 instruct mulF_imm(regF dst, immF con) %{
2414   predicate((UseSSE>=1) && (UseAVX == 0));
2415   match(Set dst (MulF dst con));
2416   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2417   ins_cost(150);
2418   ins_encode %{
2419     __ mulss($dst$$XMMRegister, $constantaddress($con));
2420   %}
2421   ins_pipe(pipe_slow);
2422 %}
2423 
2424 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2425   predicate(UseAVX > 0);
2426   match(Set dst (MulF src1 src2));
2427 
2428   format %{ "vmulss  $dst, $src1, $src2" %}
2429   ins_cost(150);
2430   ins_encode %{
2431     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2432   %}
2433   ins_pipe(pipe_slow);
2434 %}
2435 
2436 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2437   predicate(UseAVX > 0);
2438   match(Set dst (MulF src1 (LoadF src2)));
2439 
2440   format %{ "vmulss  $dst, $src1, $src2" %}
2441   ins_cost(150);
2442   ins_encode %{
2443     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2444   %}
2445   ins_pipe(pipe_slow);
2446 %}
2447 
2448 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2449   predicate(UseAVX > 0);
2450   match(Set dst (MulF src con));
2451 
2452   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2453   ins_cost(150);
2454   ins_encode %{
2455     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2456   %}
2457   ins_pipe(pipe_slow);
2458 %}
2459 
2460 instruct mulD_reg(regD dst, regD src) %{
2461   predicate((UseSSE>=2) && (UseAVX == 0));
2462   match(Set dst (MulD dst src));
2463 
2464   format %{ "mulsd   $dst, $src" %}
2465   ins_cost(150);
2466   ins_encode %{
2467     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2468   %}
2469   ins_pipe(pipe_slow);
2470 %}
2471 
2472 instruct mulD_mem(regD dst, memory src) %{
2473   predicate((UseSSE>=2) && (UseAVX == 0));
2474   match(Set dst (MulD dst (LoadD src)));
2475 
2476   format %{ "mulsd   $dst, $src" %}
2477   ins_cost(150);
2478   ins_encode %{
2479     __ mulsd($dst$$XMMRegister, $src$$Address);
2480   %}
2481   ins_pipe(pipe_slow);
2482 %}
2483 
2484 instruct mulD_imm(regD dst, immD con) %{
2485   predicate((UseSSE>=2) && (UseAVX == 0));
2486   match(Set dst (MulD dst con));
2487   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2488   ins_cost(150);
2489   ins_encode %{
2490     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2491   %}
2492   ins_pipe(pipe_slow);
2493 %}
2494 
2495 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2496   predicate(UseAVX > 0);
2497   match(Set dst (MulD src1 src2));
2498 
2499   format %{ "vmulsd  $dst, $src1, $src2" %}
2500   ins_cost(150);
2501   ins_encode %{
2502     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2503   %}
2504   ins_pipe(pipe_slow);
2505 %}
2506 
2507 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2508   predicate(UseAVX > 0);
2509   match(Set dst (MulD src1 (LoadD src2)));
2510 
2511   format %{ "vmulsd  $dst, $src1, $src2" %}
2512   ins_cost(150);
2513   ins_encode %{
2514     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2515   %}
2516   ins_pipe(pipe_slow);
2517 %}
2518 
2519 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2520   predicate(UseAVX > 0);
2521   match(Set dst (MulD src con));
2522 
2523   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2524   ins_cost(150);
2525   ins_encode %{
2526     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2527   %}
2528   ins_pipe(pipe_slow);
2529 %}
2530 
2531 instruct divF_reg(regF dst, regF src) %{
2532   predicate((UseSSE>=1) && (UseAVX == 0));
2533   match(Set dst (DivF dst src));
2534 
2535   format %{ "divss   $dst, $src" %}
2536   ins_cost(150);
2537   ins_encode %{
2538     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2539   %}
2540   ins_pipe(pipe_slow);
2541 %}
2542 
2543 instruct divF_mem(regF dst, memory src) %{
2544   predicate((UseSSE>=1) && (UseAVX == 0));
2545   match(Set dst (DivF dst (LoadF src)));
2546 
2547   format %{ "divss   $dst, $src" %}
2548   ins_cost(150);
2549   ins_encode %{
2550     __ divss($dst$$XMMRegister, $src$$Address);
2551   %}
2552   ins_pipe(pipe_slow);
2553 %}
2554 
2555 instruct divF_imm(regF dst, immF con) %{
2556   predicate((UseSSE>=1) && (UseAVX == 0));
2557   match(Set dst (DivF dst con));
2558   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2559   ins_cost(150);
2560   ins_encode %{
2561     __ divss($dst$$XMMRegister, $constantaddress($con));
2562   %}
2563   ins_pipe(pipe_slow);
2564 %}
2565 
2566 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2567   predicate(UseAVX > 0);
2568   match(Set dst (DivF src1 src2));
2569 
2570   format %{ "vdivss  $dst, $src1, $src2" %}
2571   ins_cost(150);
2572   ins_encode %{
2573     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2574   %}
2575   ins_pipe(pipe_slow);
2576 %}
2577 
2578 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2579   predicate(UseAVX > 0);
2580   match(Set dst (DivF src1 (LoadF src2)));
2581 
2582   format %{ "vdivss  $dst, $src1, $src2" %}
2583   ins_cost(150);
2584   ins_encode %{
2585     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2586   %}
2587   ins_pipe(pipe_slow);
2588 %}
2589 
2590 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2591   predicate(UseAVX > 0);
2592   match(Set dst (DivF src con));
2593 
2594   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2595   ins_cost(150);
2596   ins_encode %{
2597     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2598   %}
2599   ins_pipe(pipe_slow);
2600 %}
2601 
2602 instruct divD_reg(regD dst, regD src) %{
2603   predicate((UseSSE>=2) && (UseAVX == 0));
2604   match(Set dst (DivD dst src));
2605 
2606   format %{ "divsd   $dst, $src" %}
2607   ins_cost(150);
2608   ins_encode %{
2609     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2610   %}
2611   ins_pipe(pipe_slow);
2612 %}
2613 
2614 instruct divD_mem(regD dst, memory src) %{
2615   predicate((UseSSE>=2) && (UseAVX == 0));
2616   match(Set dst (DivD dst (LoadD src)));
2617 
2618   format %{ "divsd   $dst, $src" %}
2619   ins_cost(150);
2620   ins_encode %{
2621     __ divsd($dst$$XMMRegister, $src$$Address);
2622   %}
2623   ins_pipe(pipe_slow);
2624 %}
2625 
2626 instruct divD_imm(regD dst, immD con) %{
2627   predicate((UseSSE>=2) && (UseAVX == 0));
2628   match(Set dst (DivD dst con));
2629   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2630   ins_cost(150);
2631   ins_encode %{
2632     __ divsd($dst$$XMMRegister, $constantaddress($con));
2633   %}
2634   ins_pipe(pipe_slow);
2635 %}
2636 
2637 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2638   predicate(UseAVX > 0);
2639   match(Set dst (DivD src1 src2));
2640 
2641   format %{ "vdivsd  $dst, $src1, $src2" %}
2642   ins_cost(150);
2643   ins_encode %{
2644     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2645   %}
2646   ins_pipe(pipe_slow);
2647 %}
2648 
2649 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2650   predicate(UseAVX > 0);
2651   match(Set dst (DivD src1 (LoadD src2)));
2652 
2653   format %{ "vdivsd  $dst, $src1, $src2" %}
2654   ins_cost(150);
2655   ins_encode %{
2656     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2657   %}
2658   ins_pipe(pipe_slow);
2659 %}
2660 
2661 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2662   predicate(UseAVX > 0);
2663   match(Set dst (DivD src con));
2664 
2665   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2666   ins_cost(150);
2667   ins_encode %{
2668     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2669   %}
2670   ins_pipe(pipe_slow);
2671 %}
2672 
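     // Absolute value and negation are implemented bitwise: AND with a sign mask
     // clears the sign bit, XOR with a sign-flip constant toggles it.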
2673 instruct absF_reg(regF dst) %{
2674   predicate((UseSSE>=1) && (UseAVX == 0));
2675   match(Set dst (AbsF dst));
2676   ins_cost(150);
2677   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2678   ins_encode %{
2679     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2680   %}
2681   ins_pipe(pipe_slow);
2682 %}
2683 
2684 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2685   predicate(UseAVX > 0);
2686   match(Set dst (AbsF src));
2687   ins_cost(150);
2688   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2689   ins_encode %{
2690     int vector_len = 0;
2691     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2692               ExternalAddress(float_signmask()), vector_len);
2693   %}
2694   ins_pipe(pipe_slow);
2695 %}
2696 
2697 instruct absD_reg(regD dst) %{
2698   predicate((UseSSE>=2) && (UseAVX == 0));
2699   match(Set dst (AbsD dst));
2700   ins_cost(150);
2701   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2702             "# abs double by sign masking" %}
2703   ins_encode %{
2704     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2705   %}
2706   ins_pipe(pipe_slow);
2707 %}
2708 
2709 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2710   predicate(UseAVX > 0);
2711   match(Set dst (AbsD src));
2712   ins_cost(150);
2713   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2714             "# abs double by sign masking" %}
2715   ins_encode %{
2716     int vector_len = 0;
2717     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2718               ExternalAddress(double_signmask()), vector_len);
2719   %}
2720   ins_pipe(pipe_slow);
2721 %}
2722 
2723 instruct negF_reg(regF dst) %{
2724   predicate((UseSSE>=1) && (UseAVX == 0));
2725   match(Set dst (NegF dst));
2726   ins_cost(150);
2727   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2728   ins_encode %{
2729     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2730   %}
2731   ins_pipe(pipe_slow);
2732 %}
2733 
2734 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2735   predicate(UseAVX > 0);
2736   match(Set dst (NegF src));
2737   ins_cost(150);
2738   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2739   ins_encode %{
2740     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2741                  ExternalAddress(float_signflip()));
2742   %}
2743   ins_pipe(pipe_slow);
2744 %}
2745 
2746 instruct negD_reg(regD dst) %{
2747   predicate((UseSSE>=2) && (UseAVX == 0));
2748   match(Set dst (NegD dst));
2749   ins_cost(150);
2750   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2751             "# neg double by sign flipping" %}
2752   ins_encode %{
2753     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2754   %}
2755   ins_pipe(pipe_slow);
2756 %}
2757 
2758 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2759   predicate(UseAVX > 0);
2760   match(Set dst (NegD src));
2761   ins_cost(150);
2762   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2763             "# neg double by sign flipping" %}
2764   ins_encode %{
2765     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2766                  ExternalAddress(double_signflip()));
2767   %}
2768   ins_pipe(pipe_slow);
2769 %}
2770 
2771 instruct sqrtF_reg(regF dst, regF src) %{
2772   predicate(UseSSE>=1);
2773   match(Set dst (SqrtF src));
2774 
2775   format %{ "sqrtss  $dst, $src" %}
2776   ins_cost(150);
2777   ins_encode %{
2778     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2779   %}
2780   ins_pipe(pipe_slow);
2781 %}
2782 
2783 instruct sqrtF_mem(regF dst, memory src) %{
2784   predicate(UseSSE>=1);
2785   match(Set dst (SqrtF (LoadF src)));
2786 
2787   format %{ "sqrtss  $dst, $src" %}
2788   ins_cost(150);
2789   ins_encode %{
2790     __ sqrtss($dst$$XMMRegister, $src$$Address);
2791   %}
2792   ins_pipe(pipe_slow);
2793 %}
2794 
2795 instruct sqrtF_imm(regF dst, immF con) %{
2796   predicate(UseSSE>=1);
2797   match(Set dst (SqrtF con));
2798 
2799   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2800   ins_cost(150);
2801   ins_encode %{
2802     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2803   %}
2804   ins_pipe(pipe_slow);
2805 %}
2806 
2807 instruct sqrtD_reg(regD dst, regD src) %{
2808   predicate(UseSSE>=2);
2809   match(Set dst (SqrtD src));
2810 
2811   format %{ "sqrtsd  $dst, $src" %}
2812   ins_cost(150);
2813   ins_encode %{
2814     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2815   %}
2816   ins_pipe(pipe_slow);
2817 %}
2818 
2819 instruct sqrtD_mem(regD dst, memory src) %{
2820   predicate(UseSSE>=2);
2821   match(Set dst (SqrtD (LoadD src)));
2822 
2823   format %{ "sqrtsd  $dst, $src" %}
2824   ins_cost(150);
2825   ins_encode %{
2826     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2827   %}
2828   ins_pipe(pipe_slow);
2829 %}
2830 
2831 instruct sqrtD_imm(regD dst, immD con) %{
2832   predicate(UseSSE>=2);
2833   match(Set dst (SqrtD con));
2834   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2835   ins_cost(150);
2836   ins_encode %{
2837     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2838   %}
2839   ins_pipe(pipe_slow);
2840 %}
2841 
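     // Thread.onSpinWait() intrinsic: emit a pause hint to the processor while
     // the thread is busy-waiting.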
2842 instruct onspinwait() %{
2843   match(OnSpinWait);
2844   ins_cost(200);
2845 
2846   format %{
2847     $$template
2848     if (os::is_MP()) {
2849       $$emit$$"pause\t! membar_onspinwait"
2850     } else {
2851       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
2852     }
2853   %}
2854   ins_encode %{
2855     __ pause();
2856   %}
2857   ins_pipe(pipe_slow);
2858 %}
2859 
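     // Fused multiply-add: a * b + c is computed with a single rounding step.
     // These forms are matched only when the UseFMA flag is enabled.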
2860 // a * b + c
2861 instruct fmaD_reg(regD a, regD b, regD c) %{
2862   predicate(UseFMA);
2863   match(Set c (FmaD  c (Binary a b)));
2864   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2865   ins_cost(150);
2866   ins_encode %{
2867     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2868   %}
2869   ins_pipe( pipe_slow );
2870 %}
2871 
2872 // a * b + c
2873 instruct fmaF_reg(regF a, regF b, regF c) %{
2874   predicate(UseFMA);
2875   match(Set c (FmaF  c (Binary a b)));
2876   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2877   ins_cost(150);
2878   ins_encode %{
2879     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2880   %}
2881   ins_pipe( pipe_slow );
2882 %}
2883 
2884 // ====================VECTOR INSTRUCTIONS=====================================
2885 
2886 
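     // In the encodings below, vector_len selects the encoded vector width:
     // 0 = 128-bit, 1 = 256-bit, 2 = 512-bit (EVEX).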
2887 // Load vectors (4 bytes long)
2888 instruct loadV4(vecS dst, memory mem) %{
2889   predicate(n->as_LoadVector()->memory_size() == 4);
2890   match(Set dst (LoadVector mem));
2891   ins_cost(125);
2892   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2893   ins_encode %{
2894     __ movdl($dst$$XMMRegister, $mem$$Address);
2895   %}
2896   ins_pipe( pipe_slow );
2897 %}
2898 
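     // The Move*2Leg / MoveLeg2* pairs copy a vector between the full XMM register
     // class and the 'legacy' class (XMM0-XMM15). The legacy class is needed when
     // the value feeds an instruction that has no AVX-512VL encoding and therefore
     // cannot address XMM16-XMM31.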
2899 // Move vectors (4 bytes long)
2900 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2901   match(Set dst src);
2902   format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
2903   ins_encode %{
2904     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2905   %}
2906   ins_pipe( fpu_reg_reg );
2907 %}
2908 
2909 // Move vectors (4 bytes long)
2910 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2911   match(Set dst src);
2912   format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
2913   ins_encode %{
2914     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2915   %}
2916   ins_pipe( fpu_reg_reg );
2917 %}
2918 
2919 // Load vectors (8 bytes long)
2920 instruct loadV8(vecD dst, memory mem) %{
2921   predicate(n->as_LoadVector()->memory_size() == 8);
2922   match(Set dst (LoadVector mem));
2923   ins_cost(125);
2924   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2925   ins_encode %{
2926     __ movq($dst$$XMMRegister, $mem$$Address);
2927   %}
2928   ins_pipe( pipe_slow );
2929 %}
2930 
2931 // Move vectors (8 bytes long)
2932 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
2933   match(Set dst src);
2934   format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
2935   ins_encode %{
2936     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2937   %}
2938   ins_pipe( fpu_reg_reg );
2939 %}
2940 
2941 // Move vectors (8 bytes long)
2942 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
2943   match(Set dst src);
2944   format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
2945   ins_encode %{
2946     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2947   %}
2948   ins_pipe( fpu_reg_reg );
2949 %}
2950 
2951 // Load vectors (16 bytes long)
2952 instruct loadV16(vecX dst, memory mem) %{
2953   predicate(n->as_LoadVector()->memory_size() == 16);
2954   match(Set dst (LoadVector mem));
2955   ins_cost(125);
2956   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2957   ins_encode %{
2958     __ movdqu($dst$$XMMRegister, $mem$$Address);
2959   %}
2960   ins_pipe( pipe_slow );
2961 %}
2962 
2963 // Move vectors (16 bytes long)
2964 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
2965   match(Set dst src);
2966   format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
2967   ins_encode %{
2968     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2969       int vector_len = 2;
2970       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2971     } else {
2972       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2973     }
2974   %}
2975   ins_pipe( fpu_reg_reg );
2976 %}
2977 
2978 // Move vectors (16 bytes long)
2979 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
2980   match(Set dst src);
2981   format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
2982   ins_encode %{
2983     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2984       int vector_len = 2;
2985       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2986     } else {
2987       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2988     }
2989   %}
2990   ins_pipe( fpu_reg_reg );
2991 %}
2992 
2993 // Load vectors (32 bytes long)
2994 instruct loadV32(vecY dst, memory mem) %{
2995   predicate(n->as_LoadVector()->memory_size() == 32);
2996   match(Set dst (LoadVector mem));
2997   ins_cost(125);
2998   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2999   ins_encode %{
3000     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3001   %}
3002   ins_pipe( pipe_slow );
3003 %}
3004 
3005 // Move vectors (32 bytes long)
3006 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3007   match(Set dst src);
3008   format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
3009   ins_encode %{
3010     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3011       int vector_len = 2;
3012       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3013     } else {
3014       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3015     }
3016   %}
3017   ins_pipe( fpu_reg_reg );
3018 %}
3019 
3020 // Move vectors (32 bytes long)
3021 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3022   match(Set dst src);
3023   format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
3024   ins_encode %{
3025     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3026       int vector_len = 2;
3027       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3028     } else {
3029       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3030     }
3031   %}
3032   ins_pipe( fpu_reg_reg );
3033 %}
3034 
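     // The 64-byte forms pick evmovdqul or evmovdquq according to the vector's
     // element size.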
3035 // Load vectors (64 bytes long)
3036 instruct loadV64_dword(vecZ dst, memory mem) %{
3037   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3038   match(Set dst (LoadVector mem));
3039   ins_cost(125);
3040   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3041   ins_encode %{
3042     int vector_len = 2;
3043     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3044   %}
3045   ins_pipe( pipe_slow );
3046 %}
3047 
3048 // Load vectors (64 bytes long)
3049 instruct loadV64_qword(vecZ dst, memory mem) %{
3050   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3051   match(Set dst (LoadVector mem));
3052   ins_cost(125);
3053   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3054   ins_encode %{
3055     int vector_len = 2;
3056     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3057   %}
3058   ins_pipe( pipe_slow );
3059 %}
3060 
3061 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3062   match(Set dst src);
3063   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3064   ins_encode %{
3065     int vector_len = 2;
3066     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3067   %}
3068   ins_pipe( fpu_reg_reg );
3069 %}
3070 
3071 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3072   match(Set dst src);
3073   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3074   ins_encode %{
3075     int vector_len = 2;
3076     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3077   %}
3078   ins_pipe( fpu_reg_reg );
3079 %}
3080 
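     // Stores mirror the loads above: movd/movq/movdqu/vmovdqu for 4- to 32-byte
     // vectors, EVEX moves for 64-byte vectors.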
3081 // Store vectors
3082 instruct storeV4(memory mem, vecS src) %{
3083   predicate(n->as_StoreVector()->memory_size() == 4);
3084   match(Set mem (StoreVector mem src));
3085   ins_cost(145);
3086   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3087   ins_encode %{
3088     __ movdl($mem$$Address, $src$$XMMRegister);
3089   %}
3090   ins_pipe( pipe_slow );
3091 %}
3092 
3093 instruct storeV8(memory mem, vecD src) %{
3094   predicate(n->as_StoreVector()->memory_size() == 8);
3095   match(Set mem (StoreVector mem src));
3096   ins_cost(145);
3097   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3098   ins_encode %{
3099     __ movq($mem$$Address, $src$$XMMRegister);
3100   %}
3101   ins_pipe( pipe_slow );
3102 %}
3103 
3104 instruct storeV16(memory mem, vecX src) %{
3105   predicate(n->as_StoreVector()->memory_size() == 16);
3106   match(Set mem (StoreVector mem src));
3107   ins_cost(145);
3108   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3109   ins_encode %{
3110     __ movdqu($mem$$Address, $src$$XMMRegister);
3111   %}
3112   ins_pipe( pipe_slow );
3113 %}
3114 
3115 instruct storeV32(memory mem, vecY src) %{
3116   predicate(n->as_StoreVector()->memory_size() == 32);
3117   match(Set mem (StoreVector mem src));
3118   ins_cost(145);
3119   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3120   ins_encode %{
3121     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3122   %}
3123   ins_pipe( pipe_slow );
3124 %}
3125 
3126 instruct storeV64_dword(memory mem, vecZ src) %{
3127   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3128   match(Set mem (StoreVector mem src));
3129   ins_cost(145);
3130   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3131   ins_encode %{
3132     int vector_len = 2;
3133     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3134   %}
3135   ins_pipe( pipe_slow );
3136 %}
3137 
3138 instruct storeV64_qword(memory mem, vecZ src) %{
3139   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3140   match(Set mem (StoreVector mem src));
3141   ins_cost(145);
3142   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3143   ins_encode %{
3144     int vector_len = 2;
3145     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3146   %}
3147   ins_pipe( pipe_slow );
3148 %}
3149 
3150 // ====================LEGACY REPLICATE=======================================
3151 
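     // These replicate (broadcast) forms are used when the corresponding AVX-512VL
     // (and, for bytes/shorts, BW) broadcast instructions are unavailable; the splat
     // is built from a scalar move or constant-table load followed by shuffles and
     // 128/256-bit lane inserts.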
3152 instruct Repl16B(vecX dst, rRegI src) %{
3153   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3154   match(Set dst (ReplicateB src));
3155   format %{ "movd    $dst,$src\n\t"
3156             "punpcklbw $dst,$dst\n\t"
3157             "pshuflw $dst,$dst,0x00\n\t"
3158             "punpcklqdq $dst,$dst\t! replicate16B" %}
3159   ins_encode %{
3160     __ movdl($dst$$XMMRegister, $src$$Register);
3161     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3162     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3163     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3164   %}
3165   ins_pipe( pipe_slow );
3166 %}
3167 
3168 instruct Repl32B(vecY dst, rRegI src) %{
3169   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3170   match(Set dst (ReplicateB src));
3171   format %{ "movd    $dst,$src\n\t"
3172             "punpcklbw $dst,$dst\n\t"
3173             "pshuflw $dst,$dst,0x00\n\t"
3174             "punpcklqdq $dst,$dst\n\t"
3175             "vinserti128_high $dst,$dst\t! replicate32B" %}
3176   ins_encode %{
3177     __ movdl($dst$$XMMRegister, $src$$Register);
3178     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3179     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3180     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3181     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3182   %}
3183   ins_pipe( pipe_slow );
3184 %}
3185 
3186 instruct Repl64B(legVecZ dst, rRegI src) %{
3187   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3188   match(Set dst (ReplicateB src));
3189   format %{ "movd    $dst,$src\n\t"
3190             "punpcklbw $dst,$dst\n\t"
3191             "pshuflw $dst,$dst,0x00\n\t"
3192             "punpcklqdq $dst,$dst\n\t"
3193             "vinserti128_high $dst,$dst\n\t"
3194             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3195   ins_encode %{
3196     __ movdl($dst$$XMMRegister, $src$$Register);
3197     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3198     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3199     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3200     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3201     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3202   %}
3203   ins_pipe( pipe_slow );
3204 %}
3205 
3206 instruct Repl16B_imm(vecX dst, immI con) %{
3207   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3208   match(Set dst (ReplicateB con));
3209   format %{ "movq    $dst,[$constantaddress]\n\t"
3210             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3211   ins_encode %{
3212     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3213     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3214   %}
3215   ins_pipe( pipe_slow );
3216 %}
3217 
3218 instruct Repl32B_imm(vecY dst, immI con) %{
3219   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3220   match(Set dst (ReplicateB con));
3221   format %{ "movq    $dst,[$constantaddress]\n\t"
3222             "punpcklqdq $dst,$dst\n\t"
3223             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3224   ins_encode %{
3225     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3226     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3227     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3228   %}
3229   ins_pipe( pipe_slow );
3230 %}
3231 
3232 instruct Repl64B_imm(legVecZ dst, immI con) %{
3233   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3234   match(Set dst (ReplicateB con));
3235   format %{ "movq    $dst,[$constantaddress]\n\t"
3236             "punpcklqdq $dst,$dst\n\t"
3237             "vinserti128_high $dst,$dst\n\t"
3238             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3239   ins_encode %{
3240     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3241     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3242     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3243     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3244   %}
3245   ins_pipe( pipe_slow );
3246 %}
3247 
3248 instruct Repl4S(vecD dst, rRegI src) %{
3249   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3250   match(Set dst (ReplicateS src));
3251   format %{ "movd    $dst,$src\n\t"
3252             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3253   ins_encode %{
3254     __ movdl($dst$$XMMRegister, $src$$Register);
3255     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3256   %}
3257   ins_pipe( pipe_slow );
3258 %}
3259 
3260 instruct Repl4S_mem(vecD dst, memory mem) %{
3261   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3262   match(Set dst (ReplicateS (LoadS mem)));
3263   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3264   ins_encode %{
3265     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3266   %}
3267   ins_pipe( pipe_slow );
3268 %}
3269 
3270 instruct Repl8S(vecX dst, rRegI src) %{
3271   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3272   match(Set dst (ReplicateS src));
3273   format %{ "movd    $dst,$src\n\t"
3274             "pshuflw $dst,$dst,0x00\n\t"
3275             "punpcklqdq $dst,$dst\t! replicate8S" %}
3276   ins_encode %{
3277     __ movdl($dst$$XMMRegister, $src$$Register);
3278     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3279     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3280   %}
3281   ins_pipe( pipe_slow );
3282 %}
3283 
3284 instruct Repl8S_mem(vecX dst, memory mem) %{
3285   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3286   match(Set dst (ReplicateS (LoadS mem)));
3287   format %{ "pshuflw $dst,$mem,0x00\n\t"
3288             "punpcklqdq $dst,$dst\t! replicate8S" %}
3289   ins_encode %{
3290     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3291     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3292   %}
3293   ins_pipe( pipe_slow );
3294 %}
3295 
3296 instruct Repl8S_imm(vecX dst, immI con) %{
3297   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3298   match(Set dst (ReplicateS con));
3299   format %{ "movq    $dst,[$constantaddress]\n\t"
3300             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3301   ins_encode %{
3302     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3303     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3304   %}
3305   ins_pipe( pipe_slow );
3306 %}
3307 
3308 instruct Repl16S(vecY dst, rRegI src) %{
3309   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3310   match(Set dst (ReplicateS src));
3311   format %{ "movd    $dst,$src\n\t"
3312             "pshuflw $dst,$dst,0x00\n\t"
3313             "punpcklqdq $dst,$dst\n\t"
3314             "vinserti128_high $dst,$dst\t! replicate16S" %}
3315   ins_encode %{
3316     __ movdl($dst$$XMMRegister, $src$$Register);
3317     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3318     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3319     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3320   %}
3321   ins_pipe( pipe_slow );
3322 %}
3323 
3324 instruct Repl16S_mem(vecY dst, memory mem) %{
3325   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3326   match(Set dst (ReplicateS (LoadS mem)));
3327   format %{ "pshuflw $dst,$mem,0x00\n\t"
3328             "punpcklqdq $dst,$dst\n\t"
3329             "vinserti128_high $dst,$dst\t! replicate16S" %}
3330   ins_encode %{
3331     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3332     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3333     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3334   %}
3335   ins_pipe( pipe_slow );
3336 %}
3337 
3338 instruct Repl16S_imm(vecY dst, immI con) %{
3339   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3340   match(Set dst (ReplicateS con));
3341   format %{ "movq    $dst,[$constantaddress]\n\t"
3342             "punpcklqdq $dst,$dst\n\t"
3343             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3344   ins_encode %{
3345     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3346     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3347     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3348   %}
3349   ins_pipe( pipe_slow );
3350 %}
3351 
3352 instruct Repl32S(legVecZ dst, rRegI src) %{
3353   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3354   match(Set dst (ReplicateS src));
3355   format %{ "movd    $dst,$src\n\t"
3356             "pshuflw $dst,$dst,0x00\n\t"
3357             "punpcklqdq $dst,$dst\n\t"
3358             "vinserti128_high $dst,$dst\n\t"
3359             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3360   ins_encode %{
3361     __ movdl($dst$$XMMRegister, $src$$Register);
3362     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3363     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3364     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3365     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3366   %}
3367   ins_pipe( pipe_slow );
3368 %}
3369 
3370 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3371   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3372   match(Set dst (ReplicateS (LoadS mem)));
3373   format %{ "pshuflw $dst,$mem,0x00\n\t"
3374             "punpcklqdq $dst,$dst\n\t"
3375             "vinserti128_high $dst,$dst\n\t"
3376             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3377   ins_encode %{
3378     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3379     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3380     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3381     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3382   %}
3383   ins_pipe( pipe_slow );
3384 %}
3385 
3386 instruct Repl32S_imm(legVecZ dst, immI con) %{
3387   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3388   match(Set dst (ReplicateS con));
3389   format %{ "movq    $dst,[$constantaddress]\n\t"
3390             "punpcklqdq $dst,$dst\n\t"
3391             "vinserti128_high $dst,$dst\n\t"
3392             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3393   ins_encode %{
3394     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3395     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3396     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3397     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3398   %}
3399   ins_pipe( pipe_slow );
3400 %}
3401 
3402 instruct Repl4I(vecX dst, rRegI src) %{
3403   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3404   match(Set dst (ReplicateI src));
3405   format %{ "movd    $dst,$src\n\t"
3406             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3407   ins_encode %{
3408     __ movdl($dst$$XMMRegister, $src$$Register);
3409     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3410   %}
3411   ins_pipe( pipe_slow );
3412 %}
3413 
3414 instruct Repl4I_mem(vecX dst, memory mem) %{
3415   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3416   match(Set dst (ReplicateI (LoadI mem)));
3417   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3418   ins_encode %{
3419     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3420   %}
3421   ins_pipe( pipe_slow );
3422 %}
3423 
3424 instruct Repl8I(vecY dst, rRegI src) %{
3425   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3426   match(Set dst (ReplicateI src));
3427   format %{ "movd    $dst,$src\n\t"
3428             "pshufd  $dst,$dst,0x00\n\t"
3429             "vinserti128_high $dst,$dst\t! replicate8I" %}
3430   ins_encode %{
3431     __ movdl($dst$$XMMRegister, $src$$Register);
3432     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3433     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3434   %}
3435   ins_pipe( pipe_slow );
3436 %}
3437 
3438 instruct Repl8I_mem(vecY dst, memory mem) %{
3439   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3440   match(Set dst (ReplicateI (LoadI mem)));
3441   format %{ "pshufd  $dst,$mem,0x00\n\t"
3442             "vinserti128_high $dst,$dst\t! replicate8I" %}
3443   ins_encode %{
3444     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3445     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3446   %}
3447   ins_pipe( pipe_slow );
3448 %}
3449 
3450 instruct Repl16I(legVecZ dst, rRegI src) %{
3451   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3452   match(Set dst (ReplicateI src));
3453   format %{ "movd    $dst,$src\n\t"
3454             "pshufd  $dst,$dst,0x00\n\t"
3455             "vinserti128_high $dst,$dst\n\t"
3456             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3457   ins_encode %{
3458     __ movdl($dst$$XMMRegister, $src$$Register);
3459     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3460     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3461     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3462   %}
3463   ins_pipe( pipe_slow );
3464 %}
3465 
3466 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3467   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3468   match(Set dst (ReplicateI (LoadI mem)));
3469   format %{ "pshufd  $dst,$mem,0x00\n\t"
3470             "vinserti128_high $dst,$dst\n\t"
3471             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3472   ins_encode %{
3473     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3474     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3475     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3476   %}
3477   ins_pipe( pipe_slow );
3478 %}
3479 
3480 instruct Repl4I_imm(vecX dst, immI con) %{
3481   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3482   match(Set dst (ReplicateI con));
3483   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3484             "punpcklqdq $dst,$dst" %}
3485   ins_encode %{
3486     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3487     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3488   %}
3489   ins_pipe( pipe_slow );
3490 %}
3491 
3492 instruct Repl8I_imm(vecY dst, immI con) %{
3493   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3494   match(Set dst (ReplicateI con));
3495   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3496             "punpcklqdq $dst,$dst\n\t"
3497             "vinserti128_high $dst,$dst" %}
3498   ins_encode %{
3499     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3500     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3501     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3502   %}
3503   ins_pipe( pipe_slow );
3504 %}
3505 
3506 instruct Repl16I_imm(legVecZ dst, immI con) %{
3507   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3508   match(Set dst (ReplicateI con));
3509   format %{ "movq    $dst,[$constantaddress]\n\t"
3510             "punpcklqdq $dst,$dst\n\t"
3511             "vinserti128_high $dst,$dst\n\t"
3512             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3513   ins_encode %{
3514     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3515     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3516     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3517     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
3522 // A long can be loaded into an XMM register directly from memory.
3523 instruct Repl2L_mem(vecX dst, memory mem) %{
3524   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3525   match(Set dst (ReplicateL (LoadL mem)));
3526   format %{ "movq    $dst,$mem\n\t"
3527             "punpcklqdq $dst,$dst\t! replicate2L" %}
3528   ins_encode %{
3529     __ movq($dst$$XMMRegister, $mem$$Address);
3530     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3531   %}
3532   ins_pipe( pipe_slow );
3533 %}
3534 
3535 // Replicate a long (8-byte) scalar into a vector
3536 #ifdef _LP64
3537 instruct Repl4L(vecY dst, rRegL src) %{
3538   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3539   match(Set dst (ReplicateL src));
3540   format %{ "movdq   $dst,$src\n\t"
3541             "punpcklqdq $dst,$dst\n\t"
3542             "vinserti128_high $dst,$dst\t! replicate4L" %}
3543   ins_encode %{
3544     __ movdq($dst$$XMMRegister, $src$$Register);
3545     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3546     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3547   %}
3548   ins_pipe( pipe_slow );
3549 %}
3550 
3551 instruct Repl8L(legVecZ dst, rRegL src) %{
3552   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3553   match(Set dst (ReplicateL src));
3554   format %{ "movdq   $dst,$src\n\t"
3555             "punpcklqdq $dst,$dst\n\t"
3556             "vinserti128_high $dst,$dst\n\t"
3557             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3558   ins_encode %{
3559     __ movdq($dst$$XMMRegister, $src$$Register);
3560     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3561     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3562     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3563   %}
3564   ins_pipe( pipe_slow );
3565 %}
3566 #else // _LP64
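     // Without _LP64 the long arrives as a register pair: the low and high 32-bit
     // halves are moved into XMM lanes separately and merged with punpckldq before
     // the quadword is broadcast.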
3567 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3568   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3569   match(Set dst (ReplicateL src));
3570   effect(TEMP dst, USE src, TEMP tmp);
3571   format %{ "movdl   $dst,$src.lo\n\t"
3572             "movdl   $tmp,$src.hi\n\t"
3573             "punpckldq $dst,$tmp\n\t"
3574             "punpcklqdq $dst,$dst\n\t"
3575             "vinserti128_high $dst,$dst\t! replicate4L" %}
3576   ins_encode %{
3577     __ movdl($dst$$XMMRegister, $src$$Register);
3578     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3579     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3580     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3581     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3582   %}
3583   ins_pipe( pipe_slow );
3584 %}
3585 
3586 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3587   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3588   match(Set dst (ReplicateL src));
3589   effect(TEMP dst, USE src, TEMP tmp);
3590   format %{ "movdl   $dst,$src.lo\n\t"
3591             "movdl   $tmp,$src.hi\n\t"
3592             "punpckldq $dst,$tmp\n\t"
3593             "punpcklqdq $dst,$dst\n\t"
3594             "vinserti128_high $dst,$dst\n\t"
3595             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3596   ins_encode %{
3597     __ movdl($dst$$XMMRegister, $src$$Register);
3598     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3599     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3600     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3601     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3602     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3603   %}
3604   ins_pipe( pipe_slow );
3605 %}
3606 #endif // _LP64
3607 
3608 instruct Repl4L_imm(vecY dst, immL con) %{
3609   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3610   match(Set dst (ReplicateL con));
3611   format %{ "movq    $dst,[$constantaddress]\n\t"
3612             "punpcklqdq $dst,$dst\n\t"
3613             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3614   ins_encode %{
3615     __ movq($dst$$XMMRegister, $constantaddress($con));
3616     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3617     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3618   %}
3619   ins_pipe( pipe_slow );
3620 %}
3621 
3622 instruct Repl8L_imm(legVecZ dst, immL con) %{
3623   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3624   match(Set dst (ReplicateL con));
3625   format %{ "movq    $dst,[$constantaddress]\n\t"
3626             "punpcklqdq $dst,$dst\n\t"
3627             "vinserti128_high $dst,$dst\n\t"
3628             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3629   ins_encode %{
3630     __ movq($dst$$XMMRegister, $constantaddress($con));
3631     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3632     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3633     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3634   %}
3635   ins_pipe( pipe_slow );
3636 %}
3637 
3638 instruct Repl4L_mem(vecY dst, memory mem) %{
3639   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3640   match(Set dst (ReplicateL (LoadL mem)));
3641   format %{ "movq    $dst,$mem\n\t"
3642             "punpcklqdq $dst,$dst\n\t"
3643             "vinserti128_high $dst,$dst\t! replicate4L" %}
3644   ins_encode %{
3645     __ movq($dst$$XMMRegister, $mem$$Address);
3646     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3647     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3648   %}
3649   ins_pipe( pipe_slow );
3650 %}
3651 
3652 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3653   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3654   match(Set dst (ReplicateL (LoadL mem)));
3655   format %{ "movq    $dst,$mem\n\t"
3656             "punpcklqdq $dst,$dst\n\t"
3657             "vinserti128_high $dst,$dst\n\t"
3658             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3659   ins_encode %{
3660     __ movq($dst$$XMMRegister, $mem$$Address);
3661     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3662     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3663     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3664   %}
3665   ins_pipe( pipe_slow );
3666 %}
3667 
3668 instruct Repl2F_mem(vecD dst, memory mem) %{
3669   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3670   match(Set dst (ReplicateF (LoadF mem)));
3671   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3672   ins_encode %{
3673     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3674   %}
3675   ins_pipe( pipe_slow );
3676 %}
3677 
3678 instruct Repl4F_mem(vecX dst, memory mem) %{
3679   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3680   match(Set dst (ReplicateF (LoadF mem)));
3681   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3682   ins_encode %{
3683     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3684   %}
3685   ins_pipe( pipe_slow );
3686 %}
3687 
3688 instruct Repl8F(vecY dst, vlRegF src) %{
3689   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3690   match(Set dst (ReplicateF src));
3691   format %{ "pshufd  $dst,$src,0x00\n\t"
3692             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3693   ins_encode %{
3694     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3695     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3696   %}
3697   ins_pipe( pipe_slow );
3698 %}
3699 
3700 instruct Repl8F_mem(vecY dst, memory mem) %{
3701   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3702   match(Set dst (ReplicateF (LoadF mem)));
3703   format %{ "pshufd  $dst,$mem,0x00\n\t"
3704             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3705   ins_encode %{
3706     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3707     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3708   %}
3709   ins_pipe( pipe_slow );
3710 %}
3711 
3712 instruct Repl16F(legVecZ dst, vlRegF src) %{
3713   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3714   match(Set dst (ReplicateF src));
3715   format %{ "pshufd  $dst,$src,0x00\n\t"
3716             "vinsertf128_high $dst,$dst\n\t"
3717             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3718   ins_encode %{
3719     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3720     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3721     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3722   %}
3723   ins_pipe( pipe_slow );
3724 %}
3725 
3726 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3727   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3728   match(Set dst (ReplicateF (LoadF mem)));
3729   format %{ "pshufd  $dst,$mem,0x00\n\t"
3730             "vinsertf128_high $dst,$dst\n\t"
3731             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3732   ins_encode %{
3733     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3734     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3735     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3736   %}
3737   ins_pipe( pipe_slow );
3738 %}
3739 
3740 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3741   predicate(n->as_Vector()->length() == 2);
3742   match(Set dst (ReplicateF zero));
3743   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3744   ins_encode %{
3745     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3746   %}
3747   ins_pipe( fpu_reg_reg );
3748 %}
3749 
3750 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3751   predicate(n->as_Vector()->length() == 4);
3752   match(Set dst (ReplicateF zero));
3753   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3754   ins_encode %{
3755     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3756   %}
3757   ins_pipe( fpu_reg_reg );
3758 %}
3759 
3760 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3761   predicate(n->as_Vector()->length() == 8 && UseAVX > 0);
3762   match(Set dst (ReplicateF zero));
3763   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3764   ins_encode %{
3765     int vector_len = 1;
3766     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3767   %}
3768   ins_pipe( fpu_reg_reg );
3769 %}
3770 
3771 instruct Repl2D_mem(vecX dst, memory mem) %{
3772   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3773   match(Set dst (ReplicateD (LoadD mem)));
3774   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3775   ins_encode %{
3776     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3777   %}
3778   ins_pipe( pipe_slow );
3779 %}
3780 
3781 instruct Repl4D(vecY dst, vlRegD src) %{
3782   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3783   match(Set dst (ReplicateD src));
3784   format %{ "pshufd  $dst,$src,0x44\n\t"
3785             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3786   ins_encode %{
3787     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3788     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3789   %}
3790   ins_pipe( pipe_slow );
3791 %}
3792 
3793 instruct Repl4D_mem(vecY dst, memory mem) %{
3794   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3795   match(Set dst (ReplicateD (LoadD mem)));
3796   format %{ "pshufd  $dst,$mem,0x44\n\t"
3797             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3798   ins_encode %{
3799     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3800     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3801   %}
3802   ins_pipe( pipe_slow );
3803 %}
3804 
3805 instruct Repl8D(legVecZ dst, vlRegD src) %{
3806   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3807   match(Set dst (ReplicateD src));
3808   format %{ "pshufd  $dst,$src,0x44\n\t"
3809             "vinsertf128_high $dst,$dst\n\t"
3810             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3811   ins_encode %{
3812     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3813     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3814     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3815   %}
3816   ins_pipe( pipe_slow );
3817 %}
3818 
3819 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3820   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3821   match(Set dst (ReplicateD (LoadD mem)));
3822   format %{ "pshufd  $dst,$mem,0x44\n\t"
3823             "vinsertf128_high $dst,$dst\n\t"
3824             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3825   ins_encode %{
3826     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3827     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3828     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3829   %}
3830   ins_pipe( pipe_slow );
3831 %}
3832 
3833 // Replicate double (8 byte) scalar zero to be vector
3834 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3835   predicate(n->as_Vector()->length() == 2);
3836   match(Set dst (ReplicateD zero));
3837   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3838   ins_encode %{
3839     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3840   %}
3841   ins_pipe( fpu_reg_reg );
3842 %}
3843 
3844 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3845   predicate(n->as_Vector()->length() == 4 && UseAVX > 0);
3846   match(Set dst (ReplicateD zero));
3847   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3848   ins_encode %{
3849     int vector_len = 1;
3850     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3851   %}
3852   ins_pipe( fpu_reg_reg );
3853 %}
3854 
3855 // ====================GENERIC REPLICATE==========================================
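     // These rules cover targets without the EVEX broadcast forms: the scalar (or a
     // constant-table load) is splatted with movd/movq plus punpckl*/pshuf* shuffles,
     // and zero vectors are formed with pxor/vpxor.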
3856 
3857 // Replicate byte scalar to be vector
3858 instruct Repl4B(vecS dst, rRegI src) %{
3859   predicate(n->as_Vector()->length() == 4);
3860   match(Set dst (ReplicateB src));
3861   format %{ "movd    $dst,$src\n\t"
3862             "punpcklbw $dst,$dst\n\t"
3863             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3864   ins_encode %{
3865     __ movdl($dst$$XMMRegister, $src$$Register);
3866     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3867     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3868   %}
3869   ins_pipe( pipe_slow );
3870 %}
3871 
3872 instruct Repl8B(vecD dst, rRegI src) %{
3873   predicate(n->as_Vector()->length() == 8);
3874   match(Set dst (ReplicateB src));
3875   format %{ "movd    $dst,$src\n\t"
3876             "punpcklbw $dst,$dst\n\t"
3877             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3878   ins_encode %{
3879     __ movdl($dst$$XMMRegister, $src$$Register);
3880     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3881     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3882   %}
3883   ins_pipe( pipe_slow );
3884 %}
3885 
3886 // Replicate byte scalar immediate to be vector by loading from const table.
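     // replicate4_imm/replicate8_imm expand the immediate into a 32-/64-bit constant with
     // the scalar repeated at the given element size, so a single movdl/movq from the
     // constant table performs the whole splat.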
3887 instruct Repl4B_imm(vecS dst, immI con) %{
3888   predicate(n->as_Vector()->length() == 4);
3889   match(Set dst (ReplicateB con));
3890   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3891   ins_encode %{
3892     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3893   %}
3894   ins_pipe( pipe_slow );
3895 %}
3896 
3897 instruct Repl8B_imm(vecD dst, immI con) %{
3898   predicate(n->as_Vector()->length() == 8);
3899   match(Set dst (ReplicateB con));
3900   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3901   ins_encode %{
3902     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3903   %}
3904   ins_pipe( pipe_slow );
3905 %}
3906 
3907 // Replicate byte scalar zero to be vector
3908 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3909   predicate(n->as_Vector()->length() == 4);
3910   match(Set dst (ReplicateB zero));
3911   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3912   ins_encode %{
3913     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3914   %}
3915   ins_pipe( fpu_reg_reg );
3916 %}
3917 
3918 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3919   predicate(n->as_Vector()->length() == 8);
3920   match(Set dst (ReplicateB zero));
3921   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3922   ins_encode %{
3923     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3924   %}
3925   ins_pipe( fpu_reg_reg );
3926 %}
3927 
3928 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3929   predicate(n->as_Vector()->length() == 16);
3930   match(Set dst (ReplicateB zero));
3931   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3932   ins_encode %{
3933     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3934   %}
3935   ins_pipe( fpu_reg_reg );
3936 %}
3937 
3938 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3939   predicate(n->as_Vector()->length() == 32);
3940   match(Set dst (ReplicateB zero));
3941   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3942   ins_encode %{
3943     // MacroAssembler::vpxor falls back to a floating-point xor on pre-AVX2 targets, which lack 256-bit integer vpxor.
3944     int vector_len = 1;
3945     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3946   %}
3947   ins_pipe( fpu_reg_reg );
3948 %}
3949 
3950 // Replicate char/short (2 byte) scalar to be vector
3951 instruct Repl2S(vecS dst, rRegI src) %{
3952   predicate(n->as_Vector()->length() == 2);
3953   match(Set dst (ReplicateS src));
3954   format %{ "movd    $dst,$src\n\t"
3955             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3956   ins_encode %{
3957     __ movdl($dst$$XMMRegister, $src$$Register);
3958     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3959   %}
3960   ins_pipe( fpu_reg_reg );
3961 %}
3962 
3963 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3964 instruct Repl2S_imm(vecS dst, immI con) %{
3965   predicate(n->as_Vector()->length() == 2);
3966   match(Set dst (ReplicateS con));
3967   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3968   ins_encode %{
3969     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3970   %}
3971   ins_pipe( fpu_reg_reg );
3972 %}
3973 
3974 instruct Repl4S_imm(vecD dst, immI con) %{
3975   predicate(n->as_Vector()->length() == 4);
3976   match(Set dst (ReplicateS con));
3977   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3978   ins_encode %{
3979     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3980   %}
3981   ins_pipe( fpu_reg_reg );
3982 %}
3983 
3984 // Replicate char/short (2 byte) scalar zero to be vector
3985 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3986   predicate(n->as_Vector()->length() == 2);
3987   match(Set dst (ReplicateS zero));
3988   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3989   ins_encode %{
3990     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3991   %}
3992   ins_pipe( fpu_reg_reg );
3993 %}
3994 
3995 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3996   predicate(n->as_Vector()->length() == 4);
3997   match(Set dst (ReplicateS zero));
3998   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3999   ins_encode %{
4000     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4001   %}
4002   ins_pipe( fpu_reg_reg );
4003 %}
4004 
4005 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4006   predicate(n->as_Vector()->length() == 8);
4007   match(Set dst (ReplicateS zero));
4008   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4009   ins_encode %{
4010     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4011   %}
4012   ins_pipe( fpu_reg_reg );
4013 %}
4014 
4015 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4016   predicate(n->as_Vector()->length() == 16);
4017   match(Set dst (ReplicateS zero));
4018   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4019   ins_encode %{
4020     // MacroAssembler::vpxor falls back to a floating-point xor on pre-AVX2 targets, which lack 256-bit integer vpxor.
4021     int vector_len = 1;
4022     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4023   %}
4024   ins_pipe( fpu_reg_reg );
4025 %}
4026 
4027 // Replicate integer (4 byte) scalar to be vector
4028 instruct Repl2I(vecD dst, rRegI src) %{
4029   predicate(n->as_Vector()->length() == 2);
4030   match(Set dst (ReplicateI src));
4031   format %{ "movd    $dst,$src\n\t"
4032             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4033   ins_encode %{
4034     __ movdl($dst$$XMMRegister, $src$$Register);
4035     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4036   %}
4037   ins_pipe( fpu_reg_reg );
4038 %}
4039 
4040 // An integer can be loaded into an XMM register directly from memory.
4041 instruct Repl2I_mem(vecD dst, memory mem) %{
4042   predicate(n->as_Vector()->length() == 2);
4043   match(Set dst (ReplicateI (LoadI mem)));
4044   format %{ "movd    $dst,$mem\n\t"
4045             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4046   ins_encode %{
4047     __ movdl($dst$$XMMRegister, $mem$$Address);
4048     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4049   %}
4050   ins_pipe( fpu_reg_reg );
4051 %}
4052 
4053 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4054 instruct Repl2I_imm(vecD dst, immI con) %{
4055   predicate(n->as_Vector()->length() == 2);
4056   match(Set dst (ReplicateI con));
4057   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4058   ins_encode %{
4059     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4060   %}
4061   ins_pipe( fpu_reg_reg );
4062 %}
4063 
4064 // Replicate integer (4 byte) scalar zero to be vector
4065 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4066   predicate(n->as_Vector()->length() == 2);
4067   match(Set dst (ReplicateI zero));
4068   format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
4069   ins_encode %{
4070     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4071   %}
4072   ins_pipe( fpu_reg_reg );
4073 %}
4074 
4075 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4076   predicate(n->as_Vector()->length() == 4);
4077   match(Set dst (ReplicateI zero));
4078   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
4079   ins_encode %{
4080     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4081   %}
4082   ins_pipe( fpu_reg_reg );
4083 %}
4084 
4085 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4086   predicate(n->as_Vector()->length() == 8);
4087   match(Set dst (ReplicateI zero));
4088   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4089   ins_encode %{
4090     // MacroAssembler::vpxor falls back to a floating-point xor on pre-AVX2 targets, which lack 256-bit integer vpxor.
4091     int vector_len = 1;
4092     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4093   %}
4094   ins_pipe( fpu_reg_reg );
4095 %}
4096 
4097 // Replicate long (8 byte) scalar to be vector
4098 #ifdef _LP64
4099 instruct Repl2L(vecX dst, rRegL src) %{
4100   predicate(n->as_Vector()->length() == 2);
4101   match(Set dst (ReplicateL src));
4102   format %{ "movdq   $dst,$src\n\t"
4103             "punpcklqdq $dst,$dst\t! replicate2L" %}
4104   ins_encode %{
4105     __ movdq($dst$$XMMRegister, $src$$Register);
4106     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4107   %}
4108   ins_pipe( pipe_slow );
4109 %}
4110 #else // _LP64
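     // Without _LP64 the long lives in a GPR pair: move the low and high halves into XMM
     // separately and merge them with punpckldq before the 128-bit broadcast.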
4111 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4112   predicate(n->as_Vector()->length() == 2);
4113   match(Set dst (ReplicateL src));
4114   effect(TEMP dst, USE src, TEMP tmp);
4115   format %{ "movdl   $dst,$src.lo\n\t"
4116             "movdl   $tmp,$src.hi\n\t"
4117             "punpckldq $dst,$tmp\n\t"
4118             "punpcklqdq $dst,$dst\t! replicate2L"%}
4119   ins_encode %{
4120     __ movdl($dst$$XMMRegister, $src$$Register);
4121     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4122     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4123     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4124   %}
4125   ins_pipe( pipe_slow );
4126 %}
4127 #endif // _LP64
4128 
4129 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4130 instruct Repl2L_imm(vecX dst, immL con) %{
4131   predicate(n->as_Vector()->length() == 2);
4132   match(Set dst (ReplicateL con));
4133   format %{ "movq    $dst,[$constantaddress]\n\t"
4134             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4135   ins_encode %{
4136     __ movq($dst$$XMMRegister, $constantaddress($con));
4137     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4138   %}
4139   ins_pipe( pipe_slow );
4140 %}
4141 
4142 // Replicate long (8 byte) scalar zero to be vector
4143 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4144   predicate(n->as_Vector()->length() == 2);
4145   match(Set dst (ReplicateL zero));
4146   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4147   ins_encode %{
4148     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4149   %}
4150   ins_pipe( fpu_reg_reg );
4151 %}
4152 
4153 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4154   predicate(n->as_Vector()->length() == 4);
4155   match(Set dst (ReplicateL zero));
4156   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4157   ins_encode %{
4158     // MacroAssembler::vpxor falls back to a floating-point xor on pre-AVX2 targets, which lack 256-bit integer vpxor.
4159     int vector_len = 1;
4160     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4161   %}
4162   ins_pipe( fpu_reg_reg );
4163 %}
4164 
4165 // Replicate float (4 byte) scalar to be vector
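     // pshufd with shuffle mask 0x00 broadcasts dword element 0 into all four lanes.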
4166 instruct Repl2F(vecD dst, vlRegF src) %{
4167   predicate(n->as_Vector()->length() == 2);
4168   match(Set dst (ReplicateF src));
4169   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
4170   ins_encode %{
4171     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4172   %}
4173   ins_pipe( fpu_reg_reg );
4174 %}
4175 
4176 instruct Repl4F(vecX dst, vlRegF src) %{
4177   predicate(n->as_Vector()->length() == 4);
4178   match(Set dst (ReplicateF src));
4179   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
4180   ins_encode %{
4181     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4182   %}
4183   ins_pipe( pipe_slow );
4184 %}
4185 
4186 // Replicate double (8 bytes) scalar to be vector
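     // pshufd with shuffle mask 0x44 copies dwords {0,1} into both qword lanes, duplicating
     // the low 64-bit double.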
4187 instruct Repl2D(vecX dst, vlRegD src) %{
4188   predicate(n->as_Vector()->length() == 2);
4189   match(Set dst (ReplicateD src));
4190   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4191   ins_encode %{
4192     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4193   %}
4194   ins_pipe( pipe_slow );
4195 %}
4196 
4197 // ====================EVEX REPLICATE=============================================
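     // vector_len selects the encoded vector length: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit.
     // The avx512vl predicates gate EVEX encodings on vectors narrower than 512 bits;
     // avx512bw (avx512vlbw) additionally gates the byte/word element broadcasts.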
4198 
4199 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4200   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4201   match(Set dst (ReplicateB (LoadB mem)));
4202   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4203   ins_encode %{
4204     int vector_len = 0;
4205     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4206   %}
4207   ins_pipe( pipe_slow );
4208 %}
4209 
4210 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4211   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4212   match(Set dst (ReplicateB (LoadB mem)));
4213   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4214   ins_encode %{
4215     int vector_len = 0;
4216     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4217   %}
4218   ins_pipe( pipe_slow );
4219 %}
4220 
4221 instruct Repl16B_evex(vecX dst, rRegI src) %{
4222   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4223   match(Set dst (ReplicateB src));
4224   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4225   ins_encode %{
4226     int vector_len = 0;
4227     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4228   %}
4229   ins_pipe( pipe_slow );
4230 %}
4231 
4232 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4233   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4234   match(Set dst (ReplicateB (LoadB mem)));
4235   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4236   ins_encode %{
4237     int vector_len = 0;
4238     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4239   %}
4240   ins_pipe( pipe_slow );
4241 %}
4242 
4243 instruct Repl32B_evex(vecY dst, rRegI src) %{
4244   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4245   match(Set dst (ReplicateB src));
4246   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4247   ins_encode %{
4248     int vector_len = 1;
4249     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4250   %}
4251   ins_pipe( pipe_slow );
4252 %}
4253 
4254 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4255   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4256   match(Set dst (ReplicateB (LoadB mem)));
4257   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4258   ins_encode %{
4259     int vector_len = 1;
4260     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4261   %}
4262   ins_pipe( pipe_slow );
4263 %}
4264 
4265 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4266   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4267   match(Set dst (ReplicateB src));
4268   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4269   ins_encode %{
4270     int vector_len = 2;
4271     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4272   %}
4273   ins_pipe( pipe_slow );
4274 %}
4275 
4276 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4277   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4278   match(Set dst (ReplicateB (LoadB mem)));
4279   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4280   ins_encode %{
4281     int vector_len = 2;
4282     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4283   %}
4284   ins_pipe( pipe_slow );
4285 %}
4286 
4287 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4288   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4289   match(Set dst (ReplicateB con));
4290   format %{ "movq    $dst,[$constantaddress]\n\t"
4291             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4292   ins_encode %{
4293     int vector_len = 0;
4294     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4295     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4296   %}
4297   ins_pipe( pipe_slow );
4298 %}
4299 
4300 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4301   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4302   match(Set dst (ReplicateB con));
4303   format %{ "movq    $dst,[$constantaddress]\n\t"
4304             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4305   ins_encode %{
4306     int vector_len = 1;
4307     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4308     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4309   %}
4310   ins_pipe( pipe_slow );
4311 %}
4312 
4313 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4314   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4315   match(Set dst (ReplicateB con));
4316   format %{ "movq    $dst,[$constantaddress]\n\t"
4317             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4318   ins_encode %{
4319     int vector_len = 2;
4320     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4321     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4322   %}
4323   ins_pipe( pipe_slow );
4324 %}
4325 
4326 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4327   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4328   match(Set dst (ReplicateB zero));
4329   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4330   ins_encode %{
4331     // The 512-bit form of vpxor requires EVEX; the predicate guarantees UseAVX > 2, so the EVEX encoding is used here.
4332     int vector_len = 2;
4333     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4334   %}
4335   ins_pipe( fpu_reg_reg );
4336 %}
4337 
4338 instruct Repl4S_evex(vecD dst, rRegI src) %{
4339   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4340   match(Set dst (ReplicateS src));
4341   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4342   ins_encode %{
4343     int vector_len = 0;
4344     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4345   %}
4346   ins_pipe( pipe_slow );
4347 %}
4348 
4349 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4350   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4351   match(Set dst (ReplicateS (LoadS mem)));
4352   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4353   ins_encode %{
4354     int vector_len = 0;
4355     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4356   %}
4357   ins_pipe( pipe_slow );
4358 %}
4359 
4360 instruct Repl8S_evex(vecX dst, rRegI src) %{
4361   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4362   match(Set dst (ReplicateS src));
4363   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4364   ins_encode %{
4365     int vector_len = 0;
4366     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4367   %}
4368   ins_pipe( pipe_slow );
4369 %}
4370 
4371 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4372   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4373   match(Set dst (ReplicateS (LoadS mem)));
4374   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4375   ins_encode %{
4376     int vector_len = 0;
4377     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4378   %}
4379   ins_pipe( pipe_slow );
4380 %}
4381 
4382 instruct Repl16S_evex(vecY dst, rRegI src) %{
4383   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4384   match(Set dst (ReplicateS src));
4385   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4386   ins_encode %{
4387     int vector_len = 1;
4388     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4389   %}
4390   ins_pipe( pipe_slow );
4391 %}
4392 
4393 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4394   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4395   match(Set dst (ReplicateS (LoadS mem)));
4396   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4397   ins_encode %{
4398     int vector_len = 1;
4399     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4400   %}
4401   ins_pipe( pipe_slow );
4402 %}
4403 
4404 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4405   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4406   match(Set dst (ReplicateS src));
4407   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4408   ins_encode %{
4409     int vector_len = 2;
4410     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4411   %}
4412   ins_pipe( pipe_slow );
4413 %}
4414 
4415 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4416   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4417   match(Set dst (ReplicateS (LoadS mem)));
4418   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4419   ins_encode %{
4420     int vector_len = 2;
4421     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4422   %}
4423   ins_pipe( pipe_slow );
4424 %}
4425 
4426 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4427   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4428   match(Set dst (ReplicateS con));
4429   format %{ "movq    $dst,[$constantaddress]\n\t"
4430             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4431   ins_encode %{
4432     int vector_len = 0;
4433     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4434     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4435   %}
4436   ins_pipe( pipe_slow );
4437 %}
4438 
4439 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4440   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4441   match(Set dst (ReplicateS con));
4442   format %{ "movq    $dst,[$constantaddress]\n\t"
4443             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4444   ins_encode %{
4445     int vector_len = 1;
4446     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4447     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4448   %}
4449   ins_pipe( pipe_slow );
4450 %}
4451 
4452 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4453   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4454   match(Set dst (ReplicateS con));
4455   format %{ "movq    $dst,[$constantaddress]\n\t"
4456             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4457   ins_encode %{
4458     int vector_len = 2;
4459     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4460     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4461   %}
4462   ins_pipe( pipe_slow );
4463 %}
4464 
4465 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4466   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4467   match(Set dst (ReplicateS zero));
4468   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4469   ins_encode %{
4470     // The 512-bit form of vpxor requires EVEX; the predicate guarantees UseAVX > 2, so the EVEX encoding is used here.
4471     int vector_len = 2;
4472     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4473   %}
4474   ins_pipe( fpu_reg_reg );
4475 %}
4476 
4477 instruct Repl4I_evex(vecX dst, rRegI src) %{
4478   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4479   match(Set dst (ReplicateI src));
4480   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4481   ins_encode %{
4482     int vector_len = 0;
4483     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4484   %}
4485   ins_pipe( pipe_slow );
4486 %}
4487 
4488 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4489   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4490   match(Set dst (ReplicateI (LoadI mem)));
4491   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4492   ins_encode %{
4493     int vector_len = 0;
4494     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4495   %}
4496   ins_pipe( pipe_slow );
4497 %}
4498 
4499 instruct Repl8I_evex(vecY dst, rRegI src) %{
4500   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4501   match(Set dst (ReplicateI src));
4502   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4503   ins_encode %{
4504     int vector_len = 1;
4505     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4506   %}
4507   ins_pipe( pipe_slow );
4508 %}
4509 
4510 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4511   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4512   match(Set dst (ReplicateI (LoadI mem)));
4513   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4514   ins_encode %{
4515     int vector_len = 1;
4516     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4517   %}
4518   ins_pipe( pipe_slow );
4519 %}
4520 
4521 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4522   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4523   match(Set dst (ReplicateI src));
4524   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4525   ins_encode %{
4526     int vector_len = 2;
4527     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4528   %}
4529   ins_pipe( pipe_slow );
4530 %}
4531 
4532 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4533   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4534   match(Set dst (ReplicateI (LoadI mem)));
4535   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4536   ins_encode %{
4537     int vector_len = 2;
4538     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4539   %}
4540   ins_pipe( pipe_slow );
4541 %}
4542 
4543 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4544   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4545   match(Set dst (ReplicateI con));
4546   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4547             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4548   ins_encode %{
4549     int vector_len = 0;
4550     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4551     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4552   %}
4553   ins_pipe( pipe_slow );
4554 %}
4555 
4556 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4557   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4558   match(Set dst (ReplicateI con));
4559   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4560             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4561   ins_encode %{
4562     int vector_len = 1;
4563     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4564     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4565   %}
4566   ins_pipe( pipe_slow );
4567 %}
4568 
4569 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4570   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4571   match(Set dst (ReplicateI con));
4572   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4573             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4574   ins_encode %{
4575     int vector_len = 2;
4576     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4577     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4578   %}
4579   ins_pipe( pipe_slow );
4580 %}
4581 
4582 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4583   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4584   match(Set dst (ReplicateI zero));
4585   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4586   ins_encode %{
4587     // The 512-bit form of vpxor requires EVEX; the predicate guarantees UseAVX > 2, so the EVEX encoding is used here.
4588     int vector_len = 2;
4589     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4590   %}
4591   ins_pipe( fpu_reg_reg );
4592 %}
4593 
4594 // Replicate long (8 byte) scalar to be vector
4595 #ifdef _LP64
4596 instruct Repl4L_evex(vecY dst, rRegL src) %{
4597   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4598   match(Set dst (ReplicateL src));
4599   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4600   ins_encode %{
4601     int vector_len = 1;
4602     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4603   %}
4604   ins_pipe( pipe_slow );
4605 %}
4606 
4607 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4608   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4609   match(Set dst (ReplicateL src));
4610   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4611   ins_encode %{
4612     int vector_len = 2;
4613     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4614   %}
4615   ins_pipe( pipe_slow );
4616 %}
4617 #else // _LP64
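     // 32-bit variant: assemble the 64-bit value from the GPR pair with movdl/punpckldq,
     // then broadcast it with vpbroadcastq.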
4618 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4619   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4620   match(Set dst (ReplicateL src));
4621   effect(TEMP dst, USE src, TEMP tmp);
4622   format %{ "movdl   $dst,$src.lo\n\t"
4623             "movdl   $tmp,$src.hi\n\t"
4624             "punpckldq $dst,$tmp\n\t"
4625             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4626   ins_encode %{
4627     int vector_len = 1;
4628     __ movdl($dst$$XMMRegister, $src$$Register);
4629     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4630     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4631     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4632   %}
4633   ins_pipe( pipe_slow );
4634 %}
4635 
4636 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4637   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4638   match(Set dst (ReplicateL src));
4639   effect(TEMP dst, USE src, TEMP tmp);
4640   format %{ "movdl   $dst,$src.lo\n\t"
4641             "movdl   $tmp,$src.hi\n\t"
4642             "punpckldq $dst,$tmp\n\t"
4643             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4644   ins_encode %{
4645     int vector_len = 2;
4646     __ movdl($dst$$XMMRegister, $src$$Register);
4647     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4648     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4649     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4650   %}
4651   ins_pipe( pipe_slow );
4652 %}
4653 #endif // _LP64
4654 
4655 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4656   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4657   match(Set dst (ReplicateL con));
4658   format %{ "movq    $dst,[$constantaddress]\n\t"
4659             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4660   ins_encode %{
4661     int vector_len = 1;
4662     __ movq($dst$$XMMRegister, $constantaddress($con));
4663     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4664   %}
4665   ins_pipe( pipe_slow );
4666 %}
4667 
4668 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4669   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4670   match(Set dst (ReplicateL con));
4671   format %{ "movq    $dst,[$constantaddress]\n\t"
4672             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4673   ins_encode %{
4674     int vector_len = 2;
4675     __ movq($dst$$XMMRegister, $constantaddress($con));
4676     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4677   %}
4678   ins_pipe( pipe_slow );
4679 %}
4680 
4681 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4682   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4683   match(Set dst (ReplicateL (LoadL mem)));
4684   format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
4685   ins_encode %{
4686     int vector_len = 0;
4687     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4688   %}
4689   ins_pipe( pipe_slow );
4690 %}
4691 
4692 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4693   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4694   match(Set dst (ReplicateL (LoadL mem)));
4695   format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
4696   ins_encode %{
4697     int vector_len = 1;
4698     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4699   %}
4700   ins_pipe( pipe_slow );
4701 %}
4702 
4703 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4704   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4705   match(Set dst (ReplicateL (LoadL mem)));
4706   format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
4707   ins_encode %{
4708     int vector_len = 2;
4709     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4710   %}
4711   ins_pipe( pipe_slow );
4712 %}
4713 
4714 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4715   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4716   match(Set dst (ReplicateL zero));
4717   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4718   ins_encode %{
4719     // The 512-bit form of vpxor requires EVEX; the predicate guarantees UseAVX > 2, so the EVEX encoding is used here.
4720     int vector_len = 2;
4721     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4722   %}
4723   ins_pipe( fpu_reg_reg );
4724 %}
4725 
4726 instruct Repl8F_evex(vecY dst, regF src) %{
4727   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4728   match(Set dst (ReplicateF src));
4729   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4730   ins_encode %{
4731     int vector_len = 1;
4732     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4733   %}
4734   ins_pipe( pipe_slow );
4735 %}
4736 
4737 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4738   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4739   match(Set dst (ReplicateF (LoadF mem)));
4740   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4741   ins_encode %{
4742     int vector_len = 1;
4743     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4744   %}
4745   ins_pipe( pipe_slow );
4746 %}
4747 
4748 instruct Repl16F_evex(vecZ dst, regF src) %{
4749   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4750   match(Set dst (ReplicateF src));
4751   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4752   ins_encode %{
4753     int vector_len = 2;
4754     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4755   %}
4756   ins_pipe( pipe_slow );
4757 %}
4758 
4759 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4760   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4761   match(Set dst (ReplicateF (LoadF mem)));
4762   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4763   ins_encode %{
4764     int vector_len = 2;
4765     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4766   %}
4767   ins_pipe( pipe_slow );
4768 %}
4769 
4770 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4771   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4772   match(Set dst (ReplicateF zero));
4773   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4774   ins_encode %{
4775     // Use vpxor in place of vxorps since the EVEX-encoded 512-bit vxorps carries an AVX512DQ constraint.
4776     int vector_len = 2;
4777     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4778   %}
4779   ins_pipe( fpu_reg_reg );
4780 %}
4781 
4782 instruct Repl4D_evex(vecY dst, regD src) %{
4783   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4784   match(Set dst (ReplicateD src));
4785   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4786   ins_encode %{
4787     int vector_len = 1;
4788     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4789   %}
4790   ins_pipe( pipe_slow );
4791 %}
4792 
4793 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4794   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4795   match(Set dst (ReplicateD (LoadD mem)));
4796   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4797   ins_encode %{
4798     int vector_len = 1;
4799     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4800   %}
4801   ins_pipe( pipe_slow );
4802 %}
4803 
4804 instruct Repl8D_evex(vecZ dst, regD src) %{
4805   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4806   match(Set dst (ReplicateD src));
4807   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4808   ins_encode %{
4809     int vector_len = 2;
4810     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4811   %}
4812   ins_pipe( pipe_slow );
4813 %}
4814 
4815 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4816   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4817   match(Set dst (ReplicateD (LoadD mem)));
4818   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4819   ins_encode %{
4820     int vector_len = 2;
4821     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4822   %}
4823   ins_pipe( pipe_slow );
4824 %}
4825 
4826 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4827   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4828   match(Set dst (ReplicateD zero));
4829   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4830   ins_encode %{
4831     // Use vpxor in place of vxorpd since the EVEX-encoded 512-bit vxorpd carries an AVX512DQ constraint.
4832     int vector_len = 2;
4833     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4834   %}
4835   ins_pipe( fpu_reg_reg );
4836 %}
4837 
4838 // ====================REDUCTION ARITHMETIC=======================================
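     // Integer and long add reductions fold the upper half of src2 into the lower half
     // repeatedly (vextract*_high / pshufd followed by a vector add), then add the scalar
     // input src1 and move the result to the destination general register.  The FP forms
     // further below accumulate into dst lane by lane instead.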
4839 
4840 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4841   predicate(UseSSE > 2 && UseAVX == 0);
4842   match(Set dst (AddReductionVI src1 src2));
4843   effect(TEMP tmp2, TEMP tmp);
4844   format %{ "movdqu  $tmp2,$src2\n\t"
4845             "phaddd  $tmp2,$tmp2\n\t"
4846             "movd    $tmp,$src1\n\t"
4847             "paddd   $tmp,$tmp2\n\t"
4848             "movd    $dst,$tmp\t! add reduction2I" %}
4849   ins_encode %{
4850     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4851     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4852     __ movdl($tmp$$XMMRegister, $src1$$Register);
4853     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4854     __ movdl($dst$$Register, $tmp$$XMMRegister);
4855   %}
4856   ins_pipe( pipe_slow );
4857 %}
4858 
4859 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4860   predicate(VM_Version::supports_avxonly());
4861   match(Set dst (AddReductionVI src1 src2));
4862   effect(TEMP tmp, TEMP tmp2);
4863   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4864             "movd     $tmp2,$src1\n\t"
4865             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4866             "movd     $dst,$tmp2\t! add reduction2I" %}
4867   ins_encode %{
4868     int vector_len = 0;
4869     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4870     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4871     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4872     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4873   %}
4874   ins_pipe( pipe_slow );
4875 %}
4876 
4877 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4878   predicate(UseAVX > 2);
4879   match(Set dst (AddReductionVI src1 src2));
4880   effect(TEMP tmp, TEMP tmp2);
4881   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4882             "vpaddd  $tmp,$src2,$tmp2\n\t"
4883             "movd    $tmp2,$src1\n\t"
4884             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4885             "movd    $dst,$tmp2\t! add reduction2I" %}
4886   ins_encode %{
4887     int vector_len = 0;
4888     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4889     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4890     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4891     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4892     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4893   %}
4894   ins_pipe( pipe_slow );
4895 %}
4896 
4897 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4898   predicate(UseSSE > 2 && UseAVX == 0);
4899   match(Set dst (AddReductionVI src1 src2));
4900   effect(TEMP tmp, TEMP tmp2);
4901   format %{ "movdqu  $tmp,$src2\n\t"
4902             "phaddd  $tmp,$tmp\n\t"
4903             "phaddd  $tmp,$tmp\n\t"
4904             "movd    $tmp2,$src1\n\t"
4905             "paddd   $tmp2,$tmp\n\t"
4906             "movd    $dst,$tmp2\t! add reduction4I" %}
4907   ins_encode %{
4908     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4909     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4910     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4911     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4912     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4913     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4914   %}
4915   ins_pipe( pipe_slow );
4916 %}
4917 
4918 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4919   predicate(VM_Version::supports_avxonly());
4920   match(Set dst (AddReductionVI src1 src2));
4921   effect(TEMP tmp, TEMP tmp2);
4922   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4923             "vphaddd  $tmp,$tmp,$tmp\n\t"
4924             "movd     $tmp2,$src1\n\t"
4925             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4926             "movd     $dst,$tmp2\t! add reduction4I" %}
4927   ins_encode %{
4928     int vector_len = 0;
4929     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4930     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4931     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4932     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4933     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4934   %}
4935   ins_pipe( pipe_slow );
4936 %}
4937 
4938 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4939   predicate(UseAVX > 2);
4940   match(Set dst (AddReductionVI src1 src2));
4941   effect(TEMP tmp, TEMP tmp2);
4942   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4943             "vpaddd  $tmp,$src2,$tmp2\n\t"
4944             "pshufd  $tmp2,$tmp,0x1\n\t"
4945             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4946             "movd    $tmp2,$src1\n\t"
4947             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4948             "movd    $dst,$tmp2\t! add reduction4I" %}
4949   ins_encode %{
4950     int vector_len = 0;
4951     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4952     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4953     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4954     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4955     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4956     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4957     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4958   %}
4959   ins_pipe( pipe_slow );
4960 %}
4961 
4962 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
4963   predicate(VM_Version::supports_avxonly());
4964   match(Set dst (AddReductionVI src1 src2));
4965   effect(TEMP tmp, TEMP tmp2);
4966   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4967             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4968             "vextracti128_high  $tmp2,$tmp\n\t"
4969             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4970             "movd     $tmp2,$src1\n\t"
4971             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4972             "movd     $dst,$tmp2\t! add reduction8I" %}
4973   ins_encode %{
4974     int vector_len = 1;
4975     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4976     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4977     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4978     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4979     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4980     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4981     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4982   %}
4983   ins_pipe( pipe_slow );
4984 %}
4985 
4986 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
4987   predicate(UseAVX > 2);
4988   match(Set dst (AddReductionVI src1 src2));
4989   effect(TEMP tmp, TEMP tmp2);
4990   format %{ "vextracti128_high  $tmp,$src2\n\t"
4991             "vpaddd  $tmp,$tmp,$src2\n\t"
4992             "pshufd  $tmp2,$tmp,0xE\n\t"
4993             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4994             "pshufd  $tmp2,$tmp,0x1\n\t"
4995             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4996             "movd    $tmp2,$src1\n\t"
4997             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4998             "movd    $dst,$tmp2\t! add reduction8I" %}
4999   ins_encode %{
5000     int vector_len = 0;
5001     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5002     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5003     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5004     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5005     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5006     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5007     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5008     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5009     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5010   %}
5011   ins_pipe( pipe_slow );
5012 %}
5013 
5014 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5015   predicate(UseAVX > 2);
5016   match(Set dst (AddReductionVI src1 src2));
5017   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5018   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5019             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5020             "vextracti128_high  $tmp,$tmp3\n\t"
5021             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5022             "pshufd  $tmp2,$tmp,0xE\n\t"
5023             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5024             "pshufd  $tmp2,$tmp,0x1\n\t"
5025             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5026             "movd    $tmp2,$src1\n\t"
5027             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5028             "movd    $dst,$tmp2\t! add reduction16I" %}
5029   ins_encode %{
5030     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5031     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5032     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5033     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5034     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5035     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5036     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5037     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5038     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5039     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5040     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5041   %}
5042   ins_pipe( pipe_slow );
5043 %}
5044 
5045 #ifdef _LP64
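     // Long reductions need a 64-bit general register for movdq, so they are LP64-only.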
5046 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5047   predicate(UseAVX > 2);
5048   match(Set dst (AddReductionVL src1 src2));
5049   effect(TEMP tmp, TEMP tmp2);
5050   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5051             "vpaddq  $tmp,$src2,$tmp2\n\t"
5052             "movdq   $tmp2,$src1\n\t"
5053             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5054             "movdq   $dst,$tmp2\t! add reduction2L" %}
5055   ins_encode %{
5056     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5057     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5058     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5059     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5060     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5061   %}
5062   ins_pipe( pipe_slow );
5063 %}
5064 
5065 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5066   predicate(UseAVX > 2);
5067   match(Set dst (AddReductionVL src1 src2));
5068   effect(TEMP tmp, TEMP tmp2);
5069   format %{ "vextracti128_high  $tmp,$src2\n\t"
5070             "vpaddq  $tmp2,$tmp,$src2\n\t"
5071             "pshufd  $tmp,$tmp2,0xE\n\t"
5072             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5073             "movdq   $tmp,$src1\n\t"
5074             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5075             "movdq   $dst,$tmp2\t! add reduction4L" %}
5076   ins_encode %{
5077     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5078     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5079     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5080     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5081     __ movdq($tmp$$XMMRegister, $src1$$Register);
5082     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5083     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5084   %}
5085   ins_pipe( pipe_slow );
5086 %}
5087 
5088 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5089   predicate(UseAVX > 2);
5090   match(Set dst (AddReductionVL src1 src2));
5091   effect(TEMP tmp, TEMP tmp2);
5092   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5093             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5094             "vextracti128_high  $tmp,$tmp2\n\t"
5095             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5096             "pshufd  $tmp,$tmp2,0xE\n\t"
5097             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5098             "movdq   $tmp,$src1\n\t"
5099             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5100             "movdq   $dst,$tmp2\t! add reduction8L" %}
5101   ins_encode %{
5102     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5103     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5104     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5105     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5106     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5107     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5108     __ movdq($tmp$$XMMRegister, $src1$$Register);
5109     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5110     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5111   %}
5112   ins_pipe( pipe_slow );
5113 %}
5114 #endif
5115 
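// Float and double add reductions accumulate into $dst, which carries both the
// incoming scalar value and the result.  The lanes of $src2 are combined one
// element at a time, in index order, using scalar addss/addsd (vaddss/vaddsd
// on AVX) against a pshufd/vextract* copy of each lane.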
5116 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5117   predicate(UseSSE >= 1 && UseAVX == 0);
5118   match(Set dst (AddReductionVF dst src2));
5119   effect(TEMP dst, TEMP tmp);
5120   format %{ "addss   $dst,$src2\n\t"
5121             "pshufd  $tmp,$src2,0x01\n\t"
5122             "addss   $dst,$tmp\t! add reduction2F" %}
5123   ins_encode %{
5124     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5125     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5126     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5127   %}
5128   ins_pipe( pipe_slow );
5129 %}
5130 
5131 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5132   predicate(UseAVX > 0);
5133   match(Set dst (AddReductionVF dst src2));
5134   effect(TEMP dst, TEMP tmp);
5135   format %{ "vaddss  $dst,$dst,$src2\n\t"
5136             "pshufd  $tmp,$src2,0x01\n\t"
5137             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5138   ins_encode %{
5139     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5140     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5141     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5142   %}
5143   ins_pipe( pipe_slow );
5144 %}
5145 
5146 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5147   predicate(UseSSE >= 1 && UseAVX == 0);
5148   match(Set dst (AddReductionVF dst src2));
5149   effect(TEMP dst, TEMP tmp);
5150   format %{ "addss   $dst,$src2\n\t"
5151             "pshufd  $tmp,$src2,0x01\n\t"
5152             "addss   $dst,$tmp\n\t"
5153             "pshufd  $tmp,$src2,0x02\n\t"
5154             "addss   $dst,$tmp\n\t"
5155             "pshufd  $tmp,$src2,0x03\n\t"
5156             "addss   $dst,$tmp\t! add reduction4F" %}
5157   ins_encode %{
5158     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5159     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5160     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5161     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5162     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5163     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5164     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5165   %}
5166   ins_pipe( pipe_slow );
5167 %}
5168 
5169 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5170   predicate(UseAVX > 0);
5171   match(Set dst (AddReductionVF dst src2));
5172   effect(TEMP tmp, TEMP dst);
5173   format %{ "vaddss  $dst,$dst,$src2\n\t"
5174             "pshufd  $tmp,$src2,0x01\n\t"
5175             "vaddss  $dst,$dst,$tmp\n\t"
5176             "pshufd  $tmp,$src2,0x02\n\t"
5177             "vaddss  $dst,$dst,$tmp\n\t"
5178             "pshufd  $tmp,$src2,0x03\n\t"
5179             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5180   ins_encode %{
5181     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5182     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5183     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5184     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5185     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5186     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5187     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5188   %}
5189   ins_pipe( pipe_slow );
5190 %}
5191 
5192 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5193   predicate(UseAVX > 0);
5194   match(Set dst (AddReductionVF dst src2));
5195   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5196   format %{ "vaddss  $dst,$dst,$src2\n\t"
5197             "pshufd  $tmp,$src2,0x01\n\t"
5198             "vaddss  $dst,$dst,$tmp\n\t"
5199             "pshufd  $tmp,$src2,0x02\n\t"
5200             "vaddss  $dst,$dst,$tmp\n\t"
5201             "pshufd  $tmp,$src2,0x03\n\t"
5202             "vaddss  $dst,$dst,$tmp\n\t"
5203             "vextractf128_high  $tmp2,$src2\n\t"
5204             "vaddss  $dst,$dst,$tmp2\n\t"
5205             "pshufd  $tmp,$tmp2,0x01\n\t"
5206             "vaddss  $dst,$dst,$tmp\n\t"
5207             "pshufd  $tmp,$tmp2,0x02\n\t"
5208             "vaddss  $dst,$dst,$tmp\n\t"
5209             "pshufd  $tmp,$tmp2,0x03\n\t"
5210             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5211   ins_encode %{
5212     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5213     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5214     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5215     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5216     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5217     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5218     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5219     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5220     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5221     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5222     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5223     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5224     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5225     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5226     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5227   %}
5228   ins_pipe( pipe_slow );
5229 %}
5230 
5231 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5232   predicate(UseAVX > 2);
5233   match(Set dst (AddReductionVF dst src2));
5234   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5235   format %{ "vaddss  $dst,$dst,$src2\n\t"
5236             "pshufd  $tmp,$src2,0x01\n\t"
5237             "vaddss  $dst,$dst,$tmp\n\t"
5238             "pshufd  $tmp,$src2,0x02\n\t"
5239             "vaddss  $dst,$dst,$tmp\n\t"
5240             "pshufd  $tmp,$src2,0x03\n\t"
5241             "vaddss  $dst,$dst,$tmp\n\t"
5242             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5243             "vaddss  $dst,$dst,$tmp2\n\t"
5244             "pshufd  $tmp,$tmp2,0x01\n\t"
5245             "vaddss  $dst,$dst,$tmp\n\t"
5246             "pshufd  $tmp,$tmp2,0x02\n\t"
5247             "vaddss  $dst,$dst,$tmp\n\t"
5248             "pshufd  $tmp,$tmp2,0x03\n\t"
5249             "vaddss  $dst,$dst,$tmp\n\t"
5250             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5251             "vaddss  $dst,$dst,$tmp2\n\t"
5252             "pshufd  $tmp,$tmp2,0x01\n\t"
5253             "vaddss  $dst,$dst,$tmp\n\t"
5254             "pshufd  $tmp,$tmp2,0x02\n\t"
5255             "vaddss  $dst,$dst,$tmp\n\t"
5256             "pshufd  $tmp,$tmp2,0x03\n\t"
5257             "vaddss  $dst,$dst,$tmp\n\t"
5258             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5259             "vaddss  $dst,$dst,$tmp2\n\t"
5260             "pshufd  $tmp,$tmp2,0x01\n\t"
5261             "vaddss  $dst,$dst,$tmp\n\t"
5262             "pshufd  $tmp,$tmp2,0x02\n\t"
5263             "vaddss  $dst,$dst,$tmp\n\t"
5264             "pshufd  $tmp,$tmp2,0x03\n\t"
5265             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5266   ins_encode %{
5267     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5268     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5269     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5270     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5271     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5272     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5273     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5274     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5275     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5276     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5277     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5278     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5279     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5280     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5281     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5282     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5283     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5284     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5285     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5286     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5287     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5288     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5289     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5290     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5291     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5292     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5293     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5294     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5295     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5296     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5297     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5298   %}
5299   ins_pipe( pipe_slow );
5300 %}
5301 
5302 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5303   predicate(UseSSE >= 1 && UseAVX == 0);
5304   match(Set dst (AddReductionVD dst src2));
5305   effect(TEMP tmp, TEMP dst);
5306   format %{ "addsd   $dst,$src2\n\t"
5307             "pshufd  $tmp,$src2,0xE\n\t"
5308             "addsd   $dst,$tmp\t! add reduction2D" %}
5309   ins_encode %{
5310     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5311     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5312     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5313   %}
5314   ins_pipe( pipe_slow );
5315 %}
5316 
5317 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5318   predicate(UseAVX > 0);
5319   match(Set dst (AddReductionVD dst src2));
5320   effect(TEMP tmp, TEMP dst);
5321   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5322             "pshufd  $tmp,$src2,0xE\n\t"
5323             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5324   ins_encode %{
5325     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5326     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5327     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5328   %}
5329   ins_pipe( pipe_slow );
5330 %}
5331 
5332 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5333   predicate(UseAVX > 0);
5334   match(Set dst (AddReductionVD dst src2));
5335   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5336   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5337             "pshufd  $tmp,$src2,0xE\n\t"
5338             "vaddsd  $dst,$dst,$tmp\n\t"
5339             "vextractf128  $tmp2,$src2,0x1\n\t"
5340             "vaddsd  $dst,$dst,$tmp2\n\t"
5341             "pshufd  $tmp,$tmp2,0xE\n\t"
5342             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5343   ins_encode %{
5344     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5345     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5346     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5347     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5348     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5349     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5350     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5351   %}
5352   ins_pipe( pipe_slow );
5353 %}
5354 
5355 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5356   predicate(UseAVX > 2);
5357   match(Set dst (AddReductionVD dst src2));
5358   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5359   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5360             "pshufd  $tmp,$src2,0xE\n\t"
5361             "vaddsd  $dst,$dst,$tmp\n\t"
5362             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5363             "vaddsd  $dst,$dst,$tmp2\n\t"
5364             "pshufd  $tmp,$tmp2,0xE\n\t"
5365             "vaddsd  $dst,$dst,$tmp\n\t"
5366             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5367             "vaddsd  $dst,$dst,$tmp2\n\t"
5368             "pshufd  $tmp,$tmp2,0xE\n\t"
5369             "vaddsd  $dst,$dst,$tmp\n\t"
5370             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5371             "vaddsd  $dst,$dst,$tmp2\n\t"
5372             "pshufd  $tmp,$tmp2,0xE\n\t"
5373             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5374   ins_encode %{
5375     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5376     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5377     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5378     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5379     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5380     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5381     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5382     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5383     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5384     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5385     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5386     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5387     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5388     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5389     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5390   %}
5391   ins_pipe( pipe_slow );
5392 %}
5393 
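// Integer multiply reductions follow the same lane-halving scheme as the add
// reductions above, using pmulld/vpmulld instead of paddd/vpaddd; the packed
// 32-bit multiply needs SSE4.1 (UseSSE > 3) or AVX.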
5394 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5395   predicate(UseSSE > 3 && UseAVX == 0);
5396   match(Set dst (MulReductionVI src1 src2));
5397   effect(TEMP tmp, TEMP tmp2);
5398   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5399             "pmulld  $tmp2,$src2\n\t"
5400             "movd    $tmp,$src1\n\t"
5401             "pmulld  $tmp2,$tmp\n\t"
5402             "movd    $dst,$tmp2\t! mul reduction2I" %}
5403   ins_encode %{
5404     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5405     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5406     __ movdl($tmp$$XMMRegister, $src1$$Register);
5407     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5408     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5409   %}
5410   ins_pipe( pipe_slow );
5411 %}
5412 
5413 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5414   predicate(UseAVX > 0);
5415   match(Set dst (MulReductionVI src1 src2));
5416   effect(TEMP tmp, TEMP tmp2);
5417   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5418             "vpmulld  $tmp,$src2,$tmp2\n\t"
5419             "movd     $tmp2,$src1\n\t"
5420             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5421             "movd     $dst,$tmp2\t! mul reduction2I" %}
5422   ins_encode %{
5423     int vector_len = 0;
5424     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5425     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5426     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5427     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5428     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5429   %}
5430   ins_pipe( pipe_slow );
5431 %}
5432 
5433 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5434   predicate(UseSSE > 3 && UseAVX == 0);
5435   match(Set dst (MulReductionVI src1 src2));
5436   effect(TEMP tmp, TEMP tmp2);
5437   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5438             "pmulld  $tmp2,$src2\n\t"
5439             "pshufd  $tmp,$tmp2,0x1\n\t"
5440             "pmulld  $tmp2,$tmp\n\t"
5441             "movd    $tmp,$src1\n\t"
5442             "pmulld  $tmp2,$tmp\n\t"
5443             "movd    $dst,$tmp2\t! mul reduction4I" %}
5444   ins_encode %{
5445     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5446     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5447     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5448     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5449     __ movdl($tmp$$XMMRegister, $src1$$Register);
5450     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5451     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5452   %}
5453   ins_pipe( pipe_slow );
5454 %}
5455 
5456 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5457   predicate(UseAVX > 0);
5458   match(Set dst (MulReductionVI src1 src2));
5459   effect(TEMP tmp, TEMP tmp2);
5460   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5461             "vpmulld  $tmp,$src2,$tmp2\n\t"
5462             "pshufd   $tmp2,$tmp,0x1\n\t"
5463             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5464             "movd     $tmp2,$src1\n\t"
5465             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5466             "movd     $dst,$tmp2\t! mul reduction4I" %}
5467   ins_encode %{
5468     int vector_len = 0;
5469     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5470     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5471     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5472     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5473     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5474     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5475     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5476   %}
5477   ins_pipe( pipe_slow );
5478 %}
5479 
5480 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5481   predicate(UseAVX > 1);
5482   match(Set dst (MulReductionVI src1 src2));
5483   effect(TEMP tmp, TEMP tmp2);
5484   format %{ "vextracti128_high  $tmp,$src2\n\t"
5485             "vpmulld  $tmp,$tmp,$src2\n\t"
5486             "pshufd   $tmp2,$tmp,0xE\n\t"
5487             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5488             "pshufd   $tmp2,$tmp,0x1\n\t"
5489             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5490             "movd     $tmp2,$src1\n\t"
5491             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5492             "movd     $dst,$tmp2\t! mul reduction8I" %}
5493   ins_encode %{
5494     int vector_len = 0;
5495     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5496     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5497     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5498     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5499     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5500     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5501     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5502     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5503     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5504   %}
5505   ins_pipe( pipe_slow );
5506 %}
5507 
5508 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5509   predicate(UseAVX > 2);
5510   match(Set dst (MulReductionVI src1 src2));
5511   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5512   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5513             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5514             "vextracti128_high  $tmp,$tmp3\n\t"
5515             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5516             "pshufd   $tmp2,$tmp,0xE\n\t"
5517             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5518             "pshufd   $tmp2,$tmp,0x1\n\t"
5519             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5520             "movd     $tmp2,$src1\n\t"
5521             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5522             "movd     $dst,$tmp2\t! mul reduction16I" %}
5523   ins_encode %{
5524     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5525     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5526     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5527     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5528     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5529     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5530     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5531     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5532     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5533     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5534     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5535   %}
5536   ins_pipe( pipe_slow );
5537 %}
5538 
5539 #ifdef _LP64
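// Long multiply reductions (64-bit VM only).  These rely on vpmullq, so the
// predicates additionally require AVX-512DQ (VM_Version::supports_avx512dq()).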
5540 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5541   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5542   match(Set dst (MulReductionVL src1 src2));
5543   effect(TEMP tmp, TEMP tmp2);
5544   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5545             "vpmullq  $tmp,$src2,$tmp2\n\t"
5546             "movdq    $tmp2,$src1\n\t"
5547             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5548             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5549   ins_encode %{
5550     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5551     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5552     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5553     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5554     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5555   %}
5556   ins_pipe( pipe_slow );
5557 %}
5558 
5559 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5560   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5561   match(Set dst (MulReductionVL src1 src2));
5562   effect(TEMP tmp, TEMP tmp2);
5563   format %{ "vextracti128_high  $tmp,$src2\n\t"
5564             "vpmullq  $tmp2,$tmp,$src2\n\t"
5565             "pshufd   $tmp,$tmp2,0xE\n\t"
5566             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5567             "movdq    $tmp,$src1\n\t"
5568             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5569             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5570   ins_encode %{
5571     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5572     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5573     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5574     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5575     __ movdq($tmp$$XMMRegister, $src1$$Register);
5576     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5577     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5578   %}
5579   ins_pipe( pipe_slow );
5580 %}
5581 
5582 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5583   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5584   match(Set dst (MulReductionVL src1 src2));
5585   effect(TEMP tmp, TEMP tmp2);
5586   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5587             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5588             "vextracti128_high  $tmp,$tmp2\n\t"
5589             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5590             "pshufd   $tmp,$tmp2,0xE\n\t"
5591             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5592             "movdq    $tmp,$src1\n\t"
5593             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5594             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5595   ins_encode %{
5596     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5597     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5598     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5599     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5600     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5601     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5602     __ movdq($tmp$$XMMRegister, $src1$$Register);
5603     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5604     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5605   %}
5606   ins_pipe( pipe_slow );
5607 %}
5608 #endif
5609 
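// Float and double multiply reductions mirror the add reductions above:
// accumulate into $dst with scalar mulss/mulsd (vmulss/vmulsd on AVX), one
// element of $src2 at a time.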
5610 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5611   predicate(UseSSE >= 1 && UseAVX == 0);
5612   match(Set dst (MulReductionVF dst src2));
5613   effect(TEMP dst, TEMP tmp);
5614   format %{ "mulss   $dst,$src2\n\t"
5615             "pshufd  $tmp,$src2,0x01\n\t"
5616             "mulss   $dst,$tmp\t! mul reduction2F" %}
5617   ins_encode %{
5618     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5619     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5620     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5621   %}
5622   ins_pipe( pipe_slow );
5623 %}
5624 
5625 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5626   predicate(UseAVX > 0);
5627   match(Set dst (MulReductionVF dst src2));
5628   effect(TEMP tmp, TEMP dst);
5629   format %{ "vmulss  $dst,$dst,$src2\n\t"
5630             "pshufd  $tmp,$src2,0x01\n\t"
5631             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5632   ins_encode %{
5633     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5634     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5635     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5636   %}
5637   ins_pipe( pipe_slow );
5638 %}
5639 
5640 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5641   predicate(UseSSE >= 1 && UseAVX == 0);
5642   match(Set dst (MulReductionVF dst src2));
5643   effect(TEMP dst, TEMP tmp);
5644   format %{ "mulss   $dst,$src2\n\t"
5645             "pshufd  $tmp,$src2,0x01\n\t"
5646             "mulss   $dst,$tmp\n\t"
5647             "pshufd  $tmp,$src2,0x02\n\t"
5648             "mulss   $dst,$tmp\n\t"
5649             "pshufd  $tmp,$src2,0x03\n\t"
5650             "mulss   $dst,$tmp\t! mul reduction4F" %}
5651   ins_encode %{
5652     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5653     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5654     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5655     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5656     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5657     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5658     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5659   %}
5660   ins_pipe( pipe_slow );
5661 %}
5662 
5663 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5664   predicate(UseAVX > 0);
5665   match(Set dst (MulReductionVF dst src2));
5666   effect(TEMP tmp, TEMP dst);
5667   format %{ "vmulss  $dst,$dst,$src2\n\t"
5668             "pshufd  $tmp,$src2,0x01\n\t"
5669             "vmulss  $dst,$dst,$tmp\n\t"
5670             "pshufd  $tmp,$src2,0x02\n\t"
5671             "vmulss  $dst,$dst,$tmp\n\t"
5672             "pshufd  $tmp,$src2,0x03\n\t"
5673             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5674   ins_encode %{
5675     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5676     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5677     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5678     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5679     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5680     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5681     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5682   %}
5683   ins_pipe( pipe_slow );
5684 %}
5685 
5686 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5687   predicate(UseAVX > 0);
5688   match(Set dst (MulReductionVF dst src2));
5689   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5690   format %{ "vmulss  $dst,$dst,$src2\n\t"
5691             "pshufd  $tmp,$src2,0x01\n\t"
5692             "vmulss  $dst,$dst,$tmp\n\t"
5693             "pshufd  $tmp,$src2,0x02\n\t"
5694             "vmulss  $dst,$dst,$tmp\n\t"
5695             "pshufd  $tmp,$src2,0x03\n\t"
5696             "vmulss  $dst,$dst,$tmp\n\t"
5697             "vextractf128_high  $tmp2,$src2\n\t"
5698             "vmulss  $dst,$dst,$tmp2\n\t"
5699             "pshufd  $tmp,$tmp2,0x01\n\t"
5700             "vmulss  $dst,$dst,$tmp\n\t"
5701             "pshufd  $tmp,$tmp2,0x02\n\t"
5702             "vmulss  $dst,$dst,$tmp\n\t"
5703             "pshufd  $tmp,$tmp2,0x03\n\t"
5704             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5705   ins_encode %{
5706     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5707     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5708     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5709     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5710     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5711     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5712     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5713     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5714     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5715     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5716     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5717     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5718     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5719     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5720     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5721   %}
5722   ins_pipe( pipe_slow );
5723 %}
5724 
5725 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5726   predicate(UseAVX > 2);
5727   match(Set dst (MulReductionVF dst src2));
5728   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5729   format %{ "vmulss  $dst,$dst,$src2\n\t"
5730             "pshufd  $tmp,$src2,0x01\n\t"
5731             "vmulss  $dst,$dst,$tmp\n\t"
5732             "pshufd  $tmp,$src2,0x02\n\t"
5733             "vmulss  $dst,$dst,$tmp\n\t"
5734             "pshufd  $tmp,$src2,0x03\n\t"
5735             "vmulss  $dst,$dst,$tmp\n\t"
5736             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5737             "vmulss  $dst,$dst,$tmp2\n\t"
5738             "pshufd  $tmp,$tmp2,0x01\n\t"
5739             "vmulss  $dst,$dst,$tmp\n\t"
5740             "pshufd  $tmp,$tmp2,0x02\n\t"
5741             "vmulss  $dst,$dst,$tmp\n\t"
5742             "pshufd  $tmp,$tmp2,0x03\n\t"
5743             "vmulss  $dst,$dst,$tmp\n\t"
5744             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5745             "vmulss  $dst,$dst,$tmp2\n\t"
5746             "pshufd  $tmp,$tmp2,0x01\n\t"
5747             "vmulss  $dst,$dst,$tmp\n\t"
5748             "pshufd  $tmp,$tmp2,0x02\n\t"
5749             "vmulss  $dst,$dst,$tmp\n\t"
5750             "pshufd  $tmp,$tmp2,0x03\n\t"
5751             "vmulss  $dst,$dst,$tmp\n\t"
5752             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5753             "vmulss  $dst,$dst,$tmp2\n\t"
5754             "pshufd  $tmp,$tmp2,0x01\n\t"
5755             "vmulss  $dst,$dst,$tmp\n\t"
5756             "pshufd  $tmp,$tmp2,0x02\n\t"
5757             "vmulss  $dst,$dst,$tmp\n\t"
5758             "pshufd  $tmp,$tmp2,0x03\n\t"
5759             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5760   ins_encode %{
5761     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5762     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5763     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5764     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5765     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5766     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5767     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5768     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5769     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5770     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5771     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5772     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5773     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5774     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5775     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5776     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5777     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5778     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5779     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5780     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5781     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5782     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5783     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5784     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5785     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5786     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5787     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5788     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5789     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5790     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5791     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5792   %}
5793   ins_pipe( pipe_slow );
5794 %}
5795 
5796 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5797   predicate(UseSSE >= 1 && UseAVX == 0);
5798   match(Set dst (MulReductionVD dst src2));
5799   effect(TEMP dst, TEMP tmp);
5800   format %{ "mulsd   $dst,$src2\n\t"
5801             "pshufd  $tmp,$src2,0xE\n\t"
5802             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5803   ins_encode %{
5804     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5805     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5806     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5807   %}
5808   ins_pipe( pipe_slow );
5809 %}
5810 
5811 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5812   predicate(UseAVX > 0);
5813   match(Set dst (MulReductionVD dst src2));
5814   effect(TEMP tmp, TEMP dst);
5815   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5816             "pshufd  $tmp,$src2,0xE\n\t"
5817             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5818   ins_encode %{
5819     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5820     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5821     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5822   %}
5823   ins_pipe( pipe_slow );
5824 %}
5825 
5826 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
5827   predicate(UseAVX > 0);
5828   match(Set dst (MulReductionVD dst src2));
5829   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5830   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5831             "pshufd  $tmp,$src2,0xE\n\t"
5832             "vmulsd  $dst,$dst,$tmp\n\t"
5833             "vextractf128_high  $tmp2,$src2\n\t"
5834             "vmulsd  $dst,$dst,$tmp2\n\t"
5835             "pshufd  $tmp,$tmp2,0xE\n\t"
5836             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5837   ins_encode %{
5838     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5839     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5840     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5841     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5842     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5843     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5844     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5845   %}
5846   ins_pipe( pipe_slow );
5847 %}
5848 
5849 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5850   predicate(UseAVX > 2);
5851   match(Set dst (MulReductionVD dst src2));
5852   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5853   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5854             "pshufd  $tmp,$src2,0xE\n\t"
5855             "vmulsd  $dst,$dst,$tmp\n\t"
5856             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5857             "vmulsd  $dst,$dst,$tmp2\n\t"
5858             "pshufd  $tmp,$tmp2,0xE\n\t"
5859             "vmulsd  $dst,$dst,$tmp\n\t"
5860             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5861             "vmulsd  $dst,$dst,$tmp2\n\t"
5862             "pshufd  $tmp,$tmp2,0xE\n\t"
5863             "vmulsd  $dst,$dst,$tmp\n\t"
5864             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5865             "vmulsd  $dst,$dst,$tmp2\n\t"
5866             "pshufd  $tmp,$tmp2,0xE\n\t"
5867             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5868   ins_encode %{
5869     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5870     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5871     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5872     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5873     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5874     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5875     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5876     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5877     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5878     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5879     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5880     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5881     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5882     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5883     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5884   %}
5885   ins_pipe( pipe_slow );
5886 %}
5887 
5888 // ====================VECTOR ARITHMETIC=======================================
5889 
5890 // --------------------------------- ADD --------------------------------------
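// Each element size below comes in three forms: a two-operand SSE rule that
// adds into $dst in place, a three-operand AVX register rule, and an AVX rule
// with a memory operand.  The predicates select on UseAVX and on the vector
// length; the 256-bit integer forms require AVX2 and the 512-bit forms
// AVX-512, with AVX512BW also needed for byte and short elements.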
5891 
5892 // Bytes vector add
5893 instruct vadd4B(vecS dst, vecS src) %{
5894   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5895   match(Set dst (AddVB dst src));
5896   format %{ "paddb   $dst,$src\t! add packed4B" %}
5897   ins_encode %{
5898     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5899   %}
5900   ins_pipe( pipe_slow );
5901 %}
5902 
5903 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5904   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5905   match(Set dst (AddVB src1 src2));
5906   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5907   ins_encode %{
5908     int vector_len = 0;
5909     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5910   %}
5911   ins_pipe( pipe_slow );
5912 %}
5913 
5914 
5915 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
5916   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5917   match(Set dst (AddVB src (LoadVector mem)));
5918   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5919   ins_encode %{
5920     int vector_len = 0;
5921     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5922   %}
5923   ins_pipe( pipe_slow );
5924 %}
5925 
5926 instruct vadd8B(vecD dst, vecD src) %{
5927   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5928   match(Set dst (AddVB dst src));
5929   format %{ "paddb   $dst,$src\t! add packed8B" %}
5930   ins_encode %{
5931     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5932   %}
5933   ins_pipe( pipe_slow );
5934 %}
5935 
5936 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
5937   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5938   match(Set dst (AddVB src1 src2));
5939   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5940   ins_encode %{
5941     int vector_len = 0;
5942     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5943   %}
5944   ins_pipe( pipe_slow );
5945 %}
5946 
5947 
5948 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
5949   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5950   match(Set dst (AddVB src (LoadVector mem)));
5951   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5952   ins_encode %{
5953     int vector_len = 0;
5954     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5955   %}
5956   ins_pipe( pipe_slow );
5957 %}
5958 
5959 instruct vadd16B(vecX dst, vecX src) %{
5960   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5961   match(Set dst (AddVB dst src));
5962   format %{ "paddb   $dst,$src\t! add packed16B" %}
5963   ins_encode %{
5964     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5965   %}
5966   ins_pipe( pipe_slow );
5967 %}
5968 
5969 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
5970   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
5971   match(Set dst (AddVB src1 src2));
5972   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5973   ins_encode %{
5974     int vector_len = 0;
5975     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5976   %}
5977   ins_pipe( pipe_slow );
5978 %}
5979 
5980 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
5981   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5982   match(Set dst (AddVB src (LoadVector mem)));
5983   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5984   ins_encode %{
5985     int vector_len = 0;
5986     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5987   %}
5988   ins_pipe( pipe_slow );
5989 %}
5990 
5991 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
5992   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
5993   match(Set dst (AddVB src1 src2));
5994   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5995   ins_encode %{
5996     int vector_len = 1;
5997     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5998   %}
5999   ins_pipe( pipe_slow );
6000 %}
6001 
6002 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6003   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6004   match(Set dst (AddVB src (LoadVector mem)));
6005   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6006   ins_encode %{
6007     int vector_len = 1;
6008     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6009   %}
6010   ins_pipe( pipe_slow );
6011 %}
6012 
6013 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6014   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6015   match(Set dst (AddVB src1 src2));
6016   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6017   ins_encode %{
6018     int vector_len = 2;
6019     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6020   %}
6021   ins_pipe( pipe_slow );
6022 %}
6023 
6024 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6025   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6026   match(Set dst (AddVB src (LoadVector mem)));
6027   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6028   ins_encode %{
6029     int vector_len = 2;
6030     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6031   %}
6032   ins_pipe( pipe_slow );
6033 %}
6034 
6035 // Shorts/Chars vector add
6036 instruct vadd2S(vecS dst, vecS src) %{
6037   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6038   match(Set dst (AddVS dst src));
6039   format %{ "paddw   $dst,$src\t! add packed2S" %}
6040   ins_encode %{
6041     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6042   %}
6043   ins_pipe( pipe_slow );
6044 %}
6045 
6046 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6047   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6048   match(Set dst (AddVS src1 src2));
6049   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6050   ins_encode %{
6051     int vector_len = 0;
6052     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6053   %}
6054   ins_pipe( pipe_slow );
6055 %}
6056 
6057 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6058   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6059   match(Set dst (AddVS src (LoadVector mem)));
6060   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6061   ins_encode %{
6062     int vector_len = 0;
6063     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6064   %}
6065   ins_pipe( pipe_slow );
6066 %}
6067 
6068 instruct vadd4S(vecD dst, vecD src) %{
6069   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6070   match(Set dst (AddVS dst src));
6071   format %{ "paddw   $dst,$src\t! add packed4S" %}
6072   ins_encode %{
6073     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6074   %}
6075   ins_pipe( pipe_slow );
6076 %}
6077 
6078 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6079   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6080   match(Set dst (AddVS src1 src2));
6081   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6082   ins_encode %{
6083     int vector_len = 0;
6084     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6085   %}
6086   ins_pipe( pipe_slow );
6087 %}
6088 
6089 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6090   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6091   match(Set dst (AddVS src (LoadVector mem)));
6092   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6093   ins_encode %{
6094     int vector_len = 0;
6095     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6096   %}
6097   ins_pipe( pipe_slow );
6098 %}
6099 
6100 instruct vadd8S(vecX dst, vecX src) %{
6101   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6102   match(Set dst (AddVS dst src));
6103   format %{ "paddw   $dst,$src\t! add packed8S" %}
6104   ins_encode %{
6105     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6106   %}
6107   ins_pipe( pipe_slow );
6108 %}
6109 
6110 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6111   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6112   match(Set dst (AddVS src1 src2));
6113   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6114   ins_encode %{
6115     int vector_len = 0;
6116     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6117   %}
6118   ins_pipe( pipe_slow );
6119 %}
6120 
6121 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6122   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6123   match(Set dst (AddVS src (LoadVector mem)));
6124   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6125   ins_encode %{
6126     int vector_len = 0;
6127     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6128   %}
6129   ins_pipe( pipe_slow );
6130 %}
6131 
6132 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6133   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6134   match(Set dst (AddVS src1 src2));
6135   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6136   ins_encode %{
6137     int vector_len = 1;
6138     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6139   %}
6140   ins_pipe( pipe_slow );
6141 %}
6142 
6143 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6144   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6145   match(Set dst (AddVS src (LoadVector mem)));
6146   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6147   ins_encode %{
6148     int vector_len = 1;
6149     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6150   %}
6151   ins_pipe( pipe_slow );
6152 %}
6153 
6154 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6155   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6156   match(Set dst (AddVS src1 src2));
6157   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6158   ins_encode %{
6159     int vector_len = 2;
6160     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6161   %}
6162   ins_pipe( pipe_slow );
6163 %}
6164 
6165 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6166   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6167   match(Set dst (AddVS src (LoadVector mem)));
6168   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6169   ins_encode %{
6170     int vector_len = 2;
6171     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6172   %}
6173   ins_pipe( pipe_slow );
6174 %}
6175 
6176 // Integers vector add
6177 instruct vadd2I(vecD dst, vecD src) %{
6178   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6179   match(Set dst (AddVI dst src));
6180   format %{ "paddd   $dst,$src\t! add packed2I" %}
6181   ins_encode %{
6182     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6183   %}
6184   ins_pipe( pipe_slow );
6185 %}
6186 
6187 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6188   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6189   match(Set dst (AddVI src1 src2));
6190   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6191   ins_encode %{
6192     int vector_len = 0;
6193     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6194   %}
6195   ins_pipe( pipe_slow );
6196 %}
6197 
6198 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6199   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6200   match(Set dst (AddVI src (LoadVector mem)));
6201   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6202   ins_encode %{
6203     int vector_len = 0;
6204     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6205   %}
6206   ins_pipe( pipe_slow );
6207 %}
6208 
6209 instruct vadd4I(vecX dst, vecX src) %{
6210   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6211   match(Set dst (AddVI dst src));
6212   format %{ "paddd   $dst,$src\t! add packed4I" %}
6213   ins_encode %{
6214     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6215   %}
6216   ins_pipe( pipe_slow );
6217 %}
6218 
6219 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6220   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6221   match(Set dst (AddVI src1 src2));
6222   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6223   ins_encode %{
6224     int vector_len = 0;
6225     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6226   %}
6227   ins_pipe( pipe_slow );
6228 %}
6229 
6230 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6231   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6232   match(Set dst (AddVI src (LoadVector mem)));
6233   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6234   ins_encode %{
6235     int vector_len = 0;
6236     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6237   %}
6238   ins_pipe( pipe_slow );
6239 %}
6240 
6241 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6242   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6243   match(Set dst (AddVI src1 src2));
6244   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6245   ins_encode %{
6246     int vector_len = 1;
6247     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6248   %}
6249   ins_pipe( pipe_slow );
6250 %}
6251 
6252 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6253   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6254   match(Set dst (AddVI src (LoadVector mem)));
6255   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6256   ins_encode %{
6257     int vector_len = 1;
6258     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6259   %}
6260   ins_pipe( pipe_slow );
6261 %}
6262 
6263 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6264   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6265   match(Set dst (AddVI src1 src2));
6266   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6267   ins_encode %{
6268     int vector_len = 2;
6269     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6270   %}
6271   ins_pipe( pipe_slow );
6272 %}
6273 
6274 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6275   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6276   match(Set dst (AddVI src (LoadVector mem)));
6277   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6278   ins_encode %{
6279     int vector_len = 2;
6280     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6281   %}
6282   ins_pipe( pipe_slow );
6283 %}
6284 
6285 // Longs vector add
6286 instruct vadd2L(vecX dst, vecX src) %{
6287   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6288   match(Set dst (AddVL dst src));
6289   format %{ "paddq   $dst,$src\t! add packed2L" %}
6290   ins_encode %{
6291     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6292   %}
6293   ins_pipe( pipe_slow );
6294 %}
6295 
6296 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6297   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6298   match(Set dst (AddVL src1 src2));
6299   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6300   ins_encode %{
6301     int vector_len = 0;
6302     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6303   %}
6304   ins_pipe( pipe_slow );
6305 %}
6306 
6307 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6308   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6309   match(Set dst (AddVL src (LoadVector mem)));
6310   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6311   ins_encode %{
6312     int vector_len = 0;
6313     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6314   %}
6315   ins_pipe( pipe_slow );
6316 %}
6317 
6318 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6319   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6320   match(Set dst (AddVL src1 src2));
6321   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6322   ins_encode %{
6323     int vector_len = 1;
6324     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6325   %}
6326   ins_pipe( pipe_slow );
6327 %}
6328 
6329 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6330   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6331   match(Set dst (AddVL src (LoadVector mem)));
6332   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6333   ins_encode %{
6334     int vector_len = 1;
6335     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6336   %}
6337   ins_pipe( pipe_slow );
6338 %}
6339 
6340 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6341   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6342   match(Set dst (AddVL src1 src2));
6343   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6344   ins_encode %{
6345     int vector_len = 2;
6346     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6347   %}
6348   ins_pipe( pipe_slow );
6349 %}
6350 
6351 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6352   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6353   match(Set dst (AddVL src (LoadVector mem)));
6354   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6355   ins_encode %{
6356     int vector_len = 2;
6357     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6358   %}
6359   ins_pipe( pipe_slow );
6360 %}
6361 
6362 // Floats vector add
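// Note: the 256-bit floating-point forms below only require AVX (UseAVX > 0), since AVX1
// already provides 256-bit FP arithmetic; the 256-bit integer forms above need AVX2
// (UseAVX > 1), and the 512-bit forms need AVX-512 (UseAVX > 2).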
6363 instruct vadd2F(vecD dst, vecD src) %{
6364   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6365   match(Set dst (AddVF dst src));
6366   format %{ "addps   $dst,$src\t! add packed2F" %}
6367   ins_encode %{
6368     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6369   %}
6370   ins_pipe( pipe_slow );
6371 %}
6372 
6373 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6374   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6375   match(Set dst (AddVF src1 src2));
6376   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6377   ins_encode %{
6378     int vector_len = 0;
6379     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6380   %}
6381   ins_pipe( pipe_slow );
6382 %}
6383 
6384 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6385   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6386   match(Set dst (AddVF src (LoadVector mem)));
6387   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6388   ins_encode %{
6389     int vector_len = 0;
6390     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6391   %}
6392   ins_pipe( pipe_slow );
6393 %}
6394 
6395 instruct vadd4F(vecX dst, vecX src) %{
6396   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6397   match(Set dst (AddVF dst src));
6398   format %{ "addps   $dst,$src\t! add packed4F" %}
6399   ins_encode %{
6400     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6401   %}
6402   ins_pipe( pipe_slow );
6403 %}
6404 
6405 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6406   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6407   match(Set dst (AddVF src1 src2));
6408   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6409   ins_encode %{
6410     int vector_len = 0;
6411     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6412   %}
6413   ins_pipe( pipe_slow );
6414 %}
6415 
6416 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6417   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6418   match(Set dst (AddVF src (LoadVector mem)));
6419   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6420   ins_encode %{
6421     int vector_len = 0;
6422     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6423   %}
6424   ins_pipe( pipe_slow );
6425 %}
6426 
6427 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6428   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6429   match(Set dst (AddVF src1 src2));
6430   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6431   ins_encode %{
6432     int vector_len = 1;
6433     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6434   %}
6435   ins_pipe( pipe_slow );
6436 %}
6437 
6438 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6439   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6440   match(Set dst (AddVF src (LoadVector mem)));
6441   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6442   ins_encode %{
6443     int vector_len = 1;
6444     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6445   %}
6446   ins_pipe( pipe_slow );
6447 %}
6448 
6449 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6450   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6451   match(Set dst (AddVF src1 src2));
6452   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6453   ins_encode %{
6454     int vector_len = 2;
6455     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6456   %}
6457   ins_pipe( pipe_slow );
6458 %}
6459 
6460 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6461   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6462   match(Set dst (AddVF src (LoadVector mem)));
6463   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6464   ins_encode %{
6465     int vector_len = 2;
6466     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6467   %}
6468   ins_pipe( pipe_slow );
6469 %}
6470 
6471 // Doubles vector add
6472 instruct vadd2D(vecX dst, vecX src) %{
6473   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6474   match(Set dst (AddVD dst src));
6475   format %{ "addpd   $dst,$src\t! add packed2D" %}
6476   ins_encode %{
6477     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6478   %}
6479   ins_pipe( pipe_slow );
6480 %}
6481 
6482 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6483   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6484   match(Set dst (AddVD src1 src2));
6485   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6486   ins_encode %{
6487     int vector_len = 0;
6488     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6489   %}
6490   ins_pipe( pipe_slow );
6491 %}
6492 
6493 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6494   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6495   match(Set dst (AddVD src (LoadVector mem)));
6496   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6497   ins_encode %{
6498     int vector_len = 0;
6499     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6500   %}
6501   ins_pipe( pipe_slow );
6502 %}
6503 
6504 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6505   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6506   match(Set dst (AddVD src1 src2));
6507   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6508   ins_encode %{
6509     int vector_len = 1;
6510     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6511   %}
6512   ins_pipe( pipe_slow );
6513 %}
6514 
6515 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6516   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6517   match(Set dst (AddVD src (LoadVector mem)));
6518   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6519   ins_encode %{
6520     int vector_len = 1;
6521     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6522   %}
6523   ins_pipe( pipe_slow );
6524 %}
6525 
6526 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6527   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6528   match(Set dst (AddVD src1 src2));
6529   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6530   ins_encode %{
6531     int vector_len = 2;
6532     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6533   %}
6534   ins_pipe( pipe_slow );
6535 %}
6536 
6537 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6538   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6539   match(Set dst (AddVD src (LoadVector mem)));
6540   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6541   ins_encode %{
6542     int vector_len = 2;
6543     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6544   %}
6545   ins_pipe( pipe_slow );
6546 %}
6547 
6548 // --------------------------------- SUB --------------------------------------
6549 
6550 // Bytes vector sub
6551 instruct vsub4B(vecS dst, vecS src) %{
6552   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6553   match(Set dst (SubVB dst src));
6554   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6555   ins_encode %{
6556     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6557   %}
6558   ins_pipe( pipe_slow );
6559 %}
6560 
6561 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6562   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6563   match(Set dst (SubVB src1 src2));
6564   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6565   ins_encode %{
6566     int vector_len = 0;
6567     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6568   %}
6569   ins_pipe( pipe_slow );
6570 %}
6571 
6572 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6573   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6574   match(Set dst (SubVB src (LoadVector mem)));
6575   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6576   ins_encode %{
6577     int vector_len = 0;
6578     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6579   %}
6580   ins_pipe( pipe_slow );
6581 %}
6582 
6583 instruct vsub8B(vecD dst, vecD src) %{
6584   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6585   match(Set dst (SubVB dst src));
6586   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6587   ins_encode %{
6588     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6589   %}
6590   ins_pipe( pipe_slow );
6591 %}
6592 
6593 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6594   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6595   match(Set dst (SubVB src1 src2));
6596   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6597   ins_encode %{
6598     int vector_len = 0;
6599     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6600   %}
6601   ins_pipe( pipe_slow );
6602 %}
6603 
6604 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6605   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6606   match(Set dst (SubVB src (LoadVector mem)));
6607   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6608   ins_encode %{
6609     int vector_len = 0;
6610     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6611   %}
6612   ins_pipe( pipe_slow );
6613 %}
6614 
6615 instruct vsub16B(vecX dst, vecX src) %{
6616   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6617   match(Set dst (SubVB dst src));
6618   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6619   ins_encode %{
6620     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6621   %}
6622   ins_pipe( pipe_slow );
6623 %}
6624 
6625 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6626   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6627   match(Set dst (SubVB src1 src2));
6628   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6629   ins_encode %{
6630     int vector_len = 0;
6631     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6632   %}
6633   ins_pipe( pipe_slow );
6634 %}
6635 
6636 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6637   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6638   match(Set dst (SubVB src (LoadVector mem)));
6639   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6640   ins_encode %{
6641     int vector_len = 0;
6642     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6643   %}
6644   ins_pipe( pipe_slow );
6645 %}
6646 
6647 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6648   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6649   match(Set dst (SubVB src1 src2));
6650   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6651   ins_encode %{
6652     int vector_len = 1;
6653     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6654   %}
6655   ins_pipe( pipe_slow );
6656 %}
6657 
6658 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6659   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6660   match(Set dst (SubVB src (LoadVector mem)));
6661   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6662   ins_encode %{
6663     int vector_len = 1;
6664     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6665   %}
6666   ins_pipe( pipe_slow );
6667 %}
6668 
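// 512-bit byte (and short) subtracts also require AVX-512BW, which adds byte/word
// operations on the 512-bit registers; hence the supports_avx512bw() predicate below.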
6669 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6670   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6671   match(Set dst (SubVB src1 src2));
6672   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6673   ins_encode %{
6674     int vector_len = 2;
6675     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6676   %}
6677   ins_pipe( pipe_slow );
6678 %}
6679 
6680 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6681   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6682   match(Set dst (SubVB src (LoadVector mem)));
6683   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6684   ins_encode %{
6685     int vector_len = 2;
6686     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6687   %}
6688   ins_pipe( pipe_slow );
6689 %}
6690 
6691 // Shorts/Chars vector sub
6692 instruct vsub2S(vecS dst, vecS src) %{
6693   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6694   match(Set dst (SubVS dst src));
6695   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6696   ins_encode %{
6697     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6698   %}
6699   ins_pipe( pipe_slow );
6700 %}
6701 
6702 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6703   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6704   match(Set dst (SubVS src1 src2));
6705   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6706   ins_encode %{
6707     int vector_len = 0;
6708     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6709   %}
6710   ins_pipe( pipe_slow );
6711 %}
6712 
6713 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6714   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6715   match(Set dst (SubVS src (LoadVector mem)));
6716   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6717   ins_encode %{
6718     int vector_len = 0;
6719     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6720   %}
6721   ins_pipe( pipe_slow );
6722 %}
6723 
6724 instruct vsub4S(vecD dst, vecD src) %{
6725   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6726   match(Set dst (SubVS dst src));
6727   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6728   ins_encode %{
6729     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6730   %}
6731   ins_pipe( pipe_slow );
6732 %}
6733 
6734 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6735   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6736   match(Set dst (SubVS src1 src2));
6737   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6738   ins_encode %{
6739     int vector_len = 0;
6740     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6741   %}
6742   ins_pipe( pipe_slow );
6743 %}
6744 
6745 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6746   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6747   match(Set dst (SubVS src (LoadVector mem)));
6748   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6749   ins_encode %{
6750     int vector_len = 0;
6751     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6752   %}
6753   ins_pipe( pipe_slow );
6754 %}
6755 
6756 instruct vsub8S(vecX dst, vecX src) %{
6757   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6758   match(Set dst (SubVS dst src));
6759   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6760   ins_encode %{
6761     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6762   %}
6763   ins_pipe( pipe_slow );
6764 %}
6765 
6766 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6767   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6768   match(Set dst (SubVS src1 src2));
6769   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6770   ins_encode %{
6771     int vector_len = 0;
6772     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6773   %}
6774   ins_pipe( pipe_slow );
6775 %}
6776 
6777 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6778   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6779   match(Set dst (SubVS src (LoadVector mem)));
6780   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6781   ins_encode %{
6782     int vector_len = 0;
6783     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6784   %}
6785   ins_pipe( pipe_slow );
6786 %}
6787 
6788 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6789   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6790   match(Set dst (SubVS src1 src2));
6791   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6792   ins_encode %{
6793     int vector_len = 1;
6794     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6795   %}
6796   ins_pipe( pipe_slow );
6797 %}
6798 
6799 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6800   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6801   match(Set dst (SubVS src (LoadVector mem)));
6802   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6803   ins_encode %{
6804     int vector_len = 1;
6805     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6806   %}
6807   ins_pipe( pipe_slow );
6808 %}
6809 
6810 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6811   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6812   match(Set dst (SubVS src1 src2));
6813   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6814   ins_encode %{
6815     int vector_len = 2;
6816     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6817   %}
6818   ins_pipe( pipe_slow );
6819 %}
6820 
6821 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6822   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6823   match(Set dst (SubVS src (LoadVector mem)));
6824   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6825   ins_encode %{
6826     int vector_len = 2;
6827     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6828   %}
6829   ins_pipe( pipe_slow );
6830 %}
6831 
6832 // Integers vector sub
6833 instruct vsub2I(vecD dst, vecD src) %{
6834   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6835   match(Set dst (SubVI dst src));
6836   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6837   ins_encode %{
6838     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6839   %}
6840   ins_pipe( pipe_slow );
6841 %}
6842 
6843 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
6844   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6845   match(Set dst (SubVI src1 src2));
6846   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
6847   ins_encode %{
6848     int vector_len = 0;
6849     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6850   %}
6851   ins_pipe( pipe_slow );
6852 %}
6853 
6854 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
6855   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6856   match(Set dst (SubVI src (LoadVector mem)));
6857   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
6858   ins_encode %{
6859     int vector_len = 0;
6860     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6861   %}
6862   ins_pipe( pipe_slow );
6863 %}
6864 
6865 instruct vsub4I(vecX dst, vecX src) %{
6866   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6867   match(Set dst (SubVI dst src));
6868   format %{ "psubd   $dst,$src\t! sub packed4I" %}
6869   ins_encode %{
6870     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6871   %}
6872   ins_pipe( pipe_slow );
6873 %}
6874 
6875 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
6876   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6877   match(Set dst (SubVI src1 src2));
6878   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
6879   ins_encode %{
6880     int vector_len = 0;
6881     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6882   %}
6883   ins_pipe( pipe_slow );
6884 %}
6885 
6886 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
6887   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6888   match(Set dst (SubVI src (LoadVector mem)));
6889   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
6890   ins_encode %{
6891     int vector_len = 0;
6892     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6893   %}
6894   ins_pipe( pipe_slow );
6895 %}
6896 
6897 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
6898   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6899   match(Set dst (SubVI src1 src2));
6900   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
6901   ins_encode %{
6902     int vector_len = 1;
6903     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6904   %}
6905   ins_pipe( pipe_slow );
6906 %}
6907 
6908 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
6909   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6910   match(Set dst (SubVI src (LoadVector mem)));
6911   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
6912   ins_encode %{
6913     int vector_len = 1;
6914     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6915   %}
6916   ins_pipe( pipe_slow );
6917 %}
6918 
6919 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6920   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6921   match(Set dst (SubVI src1 src2));
6922   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
6923   ins_encode %{
6924     int vector_len = 2;
6925     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6926   %}
6927   ins_pipe( pipe_slow );
6928 %}
6929 
6930 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
6931   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6932   match(Set dst (SubVI src (LoadVector mem)));
6933   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
6934   ins_encode %{
6935     int vector_len = 2;
6936     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6937   %}
6938   ins_pipe( pipe_slow );
6939 %}
6940 
6941 // Longs vector sub
6942 instruct vsub2L(vecX dst, vecX src) %{
6943   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6944   match(Set dst (SubVL dst src));
6945   format %{ "psubq   $dst,$src\t! sub packed2L" %}
6946   ins_encode %{
6947     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
6948   %}
6949   ins_pipe( pipe_slow );
6950 %}
6951 
6952 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
6953   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6954   match(Set dst (SubVL src1 src2));
6955   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
6956   ins_encode %{
6957     int vector_len = 0;
6958     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6959   %}
6960   ins_pipe( pipe_slow );
6961 %}
6962 
6963 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
6964   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6965   match(Set dst (SubVL src (LoadVector mem)));
6966   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
6967   ins_encode %{
6968     int vector_len = 0;
6969     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6970   %}
6971   ins_pipe( pipe_slow );
6972 %}
6973 
6974 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
6975   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6976   match(Set dst (SubVL src1 src2));
6977   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
6978   ins_encode %{
6979     int vector_len = 1;
6980     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6981   %}
6982   ins_pipe( pipe_slow );
6983 %}
6984 
6985 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
6986   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6987   match(Set dst (SubVL src (LoadVector mem)));
6988   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
6989   ins_encode %{
6990     int vector_len = 1;
6991     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6992   %}
6993   ins_pipe( pipe_slow );
6994 %}
6995 
6996 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6997   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6998   match(Set dst (SubVL src1 src2));
6999   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7000   ins_encode %{
7001     int vector_len = 2;
7002     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7003   %}
7004   ins_pipe( pipe_slow );
7005 %}
7006 
7007 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7008   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7009   match(Set dst (SubVL src (LoadVector mem)));
7010   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7011   ins_encode %{
7012     int vector_len = 2;
7013     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7014   %}
7015   ins_pipe( pipe_slow );
7016 %}
7017 
7018 // Floats vector sub
7019 instruct vsub2F(vecD dst, vecD src) %{
7020   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7021   match(Set dst (SubVF dst src));
7022   format %{ "subps   $dst,$src\t! sub packed2F" %}
7023   ins_encode %{
7024     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7025   %}
7026   ins_pipe( pipe_slow );
7027 %}
7028 
7029 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7030   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7031   match(Set dst (SubVF src1 src2));
7032   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7033   ins_encode %{
7034     int vector_len = 0;
7035     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7036   %}
7037   ins_pipe( pipe_slow );
7038 %}
7039 
7040 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7041   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7042   match(Set dst (SubVF src (LoadVector mem)));
7043   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7044   ins_encode %{
7045     int vector_len = 0;
7046     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7047   %}
7048   ins_pipe( pipe_slow );
7049 %}
7050 
7051 instruct vsub4F(vecX dst, vecX src) %{
7052   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7053   match(Set dst (SubVF dst src));
7054   format %{ "subps   $dst,$src\t! sub packed4F" %}
7055   ins_encode %{
7056     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7057   %}
7058   ins_pipe( pipe_slow );
7059 %}
7060 
7061 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7062   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7063   match(Set dst (SubVF src1 src2));
7064   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7065   ins_encode %{
7066     int vector_len = 0;
7067     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7068   %}
7069   ins_pipe( pipe_slow );
7070 %}
7071 
7072 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7073   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7074   match(Set dst (SubVF src (LoadVector mem)));
7075   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7076   ins_encode %{
7077     int vector_len = 0;
7078     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7079   %}
7080   ins_pipe( pipe_slow );
7081 %}
7082 
7083 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7084   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7085   match(Set dst (SubVF src1 src2));
7086   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7087   ins_encode %{
7088     int vector_len = 1;
7089     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7090   %}
7091   ins_pipe( pipe_slow );
7092 %}
7093 
7094 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7095   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7096   match(Set dst (SubVF src (LoadVector mem)));
7097   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7098   ins_encode %{
7099     int vector_len = 1;
7100     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7101   %}
7102   ins_pipe( pipe_slow );
7103 %}
7104 
7105 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7106   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7107   match(Set dst (SubVF src1 src2));
7108   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7109   ins_encode %{
7110     int vector_len = 2;
7111     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7112   %}
7113   ins_pipe( pipe_slow );
7114 %}
7115 
7116 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7117   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7118   match(Set dst (SubVF src (LoadVector mem)));
7119   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7120   ins_encode %{
7121     int vector_len = 2;
7122     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7123   %}
7124   ins_pipe( pipe_slow );
7125 %}
7126 
7127 // Doubles vector sub
7128 instruct vsub2D(vecX dst, vecX src) %{
7129   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7130   match(Set dst (SubVD dst src));
7131   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7132   ins_encode %{
7133     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7134   %}
7135   ins_pipe( pipe_slow );
7136 %}
7137 
7138 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7139   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7140   match(Set dst (SubVD src1 src2));
7141   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7142   ins_encode %{
7143     int vector_len = 0;
7144     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7145   %}
7146   ins_pipe( pipe_slow );
7147 %}
7148 
7149 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7150   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7151   match(Set dst (SubVD src (LoadVector mem)));
7152   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7153   ins_encode %{
7154     int vector_len = 0;
7155     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7156   %}
7157   ins_pipe( pipe_slow );
7158 %}
7159 
7160 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7161   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7162   match(Set dst (SubVD src1 src2));
7163   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7164   ins_encode %{
7165     int vector_len = 1;
7166     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7167   %}
7168   ins_pipe( pipe_slow );
7169 %}
7170 
7171 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7172   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7173   match(Set dst (SubVD src (LoadVector mem)));
7174   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7175   ins_encode %{
7176     int vector_len = 1;
7177     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7178   %}
7179   ins_pipe( pipe_slow );
7180 %}
7181 
7182 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7183   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7184   match(Set dst (SubVD src1 src2));
7185   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7186   ins_encode %{
7187     int vector_len = 2;
7188     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7189   %}
7190   ins_pipe( pipe_slow );
7191 %}
7192 
7193 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7194   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7195   match(Set dst (SubVD src (LoadVector mem)));
7196   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7197   ins_encode %{
7198     int vector_len = 2;
7199     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7200   %}
7201   ins_pipe( pipe_slow );
7202 %}
7203 
7204 // --------------------------------- MUL --------------------------------------
7205 
7206 // Byte vector mul
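// There is no packed byte multiply in SSE/AVX, so bytes are multiplied by widening:
// sign-extend the bytes to words (pmovsxbw), multiply the words (pmullw), mask each
// result word down to its low byte (the 0x00ff... constant loaded from
// vector_short_to_byte_mask), and pack the bytes back together (packuswb).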
7207 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp, rRegI scratch) %{
7208   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7209   match(Set dst (MulVB src1 src2));
7210   effect(TEMP dst, TEMP tmp, TEMP scratch);
7211   format %{"pmovsxbw  $tmp,$src1\n\t"
7212            "pmovsxbw  $dst,$src2\n\t"
7213            "pmullw    $tmp,$dst\n\t"
7214            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7215            "pand      $dst,$tmp\n\t"
7216            "packuswb  $dst,$dst\t! mul packed4B" %}
7217   ins_encode %{
7218     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7219     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7220     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7221     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7222     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7223     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7224   %}
7225   ins_pipe( pipe_slow );
7226 %}
7227 
7228 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp, rRegI scratch) %{
7229   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
7230   match(Set dst (MulVB src1 src2));
7231   effect(TEMP dst, TEMP tmp, TEMP scratch);
7232   format %{"pmovsxbw  $tmp,$src1\n\t"
7233            "pmovsxbw  $dst,$src2\n\t"
7234            "pmullw    $tmp,$dst\n\t"
7235            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7236            "pand      $dst,$tmp\n\t"
7237            "packuswb  $dst,$dst\t! mul packed8B" %}
7238   ins_encode %{
7239     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7240     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7241     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7242     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7243     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7244     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7245   %}
7246   ins_pipe( pipe_slow );
7247 %}
7248 
7249 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp2, rRegI scratch) %{
7250   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
7251   match(Set dst (MulVB src1 src2));
7252   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7253   format %{"pmovsxbw  $tmp1,$src1\n\t"
7254            "pmovsxbw  $tmp2,$src2\n\t"
7255            "pmullw    $tmp1,$tmp2\n\t"
7256            "pshufd    $tmp2,$src1,0xEE\n\t"
7257            "pshufd    $dst,$src2,0xEE\n\t"
7258            "pmovsxbw  $tmp2,$tmp2\n\t"
7259            "pmovsxbw  $dst,$dst\n\t"
7260            "pmullw    $tmp2,$dst\n\t"
7261            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7262            "pand      $tmp2,$dst\n\t"
7263            "pand      $dst,$tmp1\n\t"
7264            "packuswb  $dst,$tmp2\t! mul packed16B" %}
7265   ins_encode %{
7266     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
7267     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
7268     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
7269     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
7270     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
7271     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
7272     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
7273     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
7274     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7275     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
7276     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
7277     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
7278   %}
7279   ins_pipe( pipe_slow );
7280 %}
7281 
7282 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp, rRegI scratch) %{
7283   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7284   match(Set dst (MulVB src1 src2));
7285   effect(TEMP dst, TEMP tmp, TEMP scratch);
7286   format %{"vpmovsxbw  $tmp,$src1\n\t"
7287            "vpmovsxbw  $dst,$src2\n\t"
7288            "vpmullw    $tmp,$tmp,$dst\n\t"
7289            "vmovdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7290            "vpand      $dst,$dst,$tmp\n\t"
7291            "vextracti128_high  $tmp,$dst\n\t"
7292            "vpackuswb  $dst,$dst,$tmp\t! mul packed16B" %}
7293   ins_encode %{
7294     int vector_len = 1;
7295     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
7296     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7297     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
7298     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7299     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
7300     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
7301     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
7302   %}
7303   ins_pipe( pipe_slow );
7304 %}
7305 
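// For the 256-bit and 512-bit byte multiplies, vpackuswb packs within each 128-bit lane,
// leaving the packed quadwords interleaved; the trailing permute (vpermq with 0xD8 for
// 256-bit, a table permute via vector_byte_perm_mask for 512-bit) restores element order.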
7306 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, rRegI scratch) %{
7307   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
7308   match(Set dst (MulVB src1 src2));
7309   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7310   format %{"vextracti128_high  $tmp1,$src1\n\t"
7311            "vextracti128_high  $dst,$src2\n\t"
7312            "vpmovsxbw $tmp1,$tmp1\n\t"
7313            "vpmovsxbw $dst,$dst\n\t"
7314            "vpmullw $tmp1,$tmp1,$dst\n\t"
7315            "vpmovsxbw $tmp2,$src1\n\t"
7316            "vpmovsxbw $dst,$src2\n\t"
7317            "vpmullw $tmp2,$tmp2,$dst\n\t"
7318            "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t"
7319            "vpbroadcastd $dst, $dst\n\t"
7320            "vpand $tmp1,$tmp1,$dst\n\t"
7321            "vpand $dst,$dst,$tmp2\n\t"
7322            "vpackuswb $dst,$dst,$tmp1\n\t"
7323            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
7324   ins_encode %{
7325     int vector_len = 1;
7326     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7327     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
7328     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7329     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7330     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7331     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7332     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7333     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7334     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7335     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7336     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7337     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7338     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7339     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
7340   %}
7341   ins_pipe( pipe_slow );
7342 %}
7343 
7344 instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
7345   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
7346   match(Set dst (MulVB src1 src2));
7347   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7348   format %{"vextracti64x4_high  $tmp1,$src1\n\t"
7349            "vextracti64x4_high  $dst,$src2\n\t"
7350            "vpmovsxbw $tmp1,$tmp1\n\t"
7351            "vpmovsxbw $dst,$dst\n\t"
7352            "vpmullw $tmp1,$tmp1,$dst\n\t"
7353            "vpmovsxbw $tmp2,$src1\n\t"
7354            "vpmovsxbw $dst,$src2\n\t"
7355            "vpmullw $tmp2,$tmp2,$dst\n\t"
7356            "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t"
7357            "vpbroadcastd $dst, $dst\n\t"
7358            "vpand $tmp1,$tmp1,$dst\n\t"
7359            "vpand $tmp2,$tmp2,$dst\n\t"
7360            "vpackuswb $dst,$tmp1,$tmp2\n\t"
7361            "evmovdquq  $tmp2,[0x0604020007050301]\n\t"
7362            "vpermq $dst,$tmp2,$dst\t! mul packed64B" %}
7364   ins_encode %{
7365     int vector_len = 2;
7366     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7367     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
7368     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7369     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7370     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7371     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7372     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7373     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7374     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7375     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7376     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7377     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7378     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7379     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
7380     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7382   %}
7383   ins_pipe( pipe_slow );
7384 %}
7385 
7386 // Shorts/Chars vector mul
7387 instruct vmul2S(vecS dst, vecS src) %{
7388   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7389   match(Set dst (MulVS dst src));
7390   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7391   ins_encode %{
7392     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7393   %}
7394   ins_pipe( pipe_slow );
7395 %}
7396 
7397 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7398   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7399   match(Set dst (MulVS src1 src2));
7400   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7401   ins_encode %{
7402     int vector_len = 0;
7403     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7404   %}
7405   ins_pipe( pipe_slow );
7406 %}
7407 
7408 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7409   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7410   match(Set dst (MulVS src (LoadVector mem)));
7411   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7412   ins_encode %{
7413     int vector_len = 0;
7414     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7415   %}
7416   ins_pipe( pipe_slow );
7417 %}
7418 
7419 instruct vmul4S(vecD dst, vecD src) %{
7420   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7421   match(Set dst (MulVS dst src));
7422   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7423   ins_encode %{
7424     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7425   %}
7426   ins_pipe( pipe_slow );
7427 %}
7428 
7429 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7430   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7431   match(Set dst (MulVS src1 src2));
7432   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7433   ins_encode %{
7434     int vector_len = 0;
7435     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7436   %}
7437   ins_pipe( pipe_slow );
7438 %}
7439 
7440 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7441   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7442   match(Set dst (MulVS src (LoadVector mem)));
7443   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7444   ins_encode %{
7445     int vector_len = 0;
7446     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7447   %}
7448   ins_pipe( pipe_slow );
7449 %}
7450 
7451 instruct vmul8S(vecX dst, vecX src) %{
7452   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7453   match(Set dst (MulVS dst src));
7454   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7455   ins_encode %{
7456     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7457   %}
7458   ins_pipe( pipe_slow );
7459 %}
7460 
7461 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7462   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7463   match(Set dst (MulVS src1 src2));
7464   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7465   ins_encode %{
7466     int vector_len = 0;
7467     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7468   %}
7469   ins_pipe( pipe_slow );
7470 %}
7471 
7472 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7473   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7474   match(Set dst (MulVS src (LoadVector mem)));
7475   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7476   ins_encode %{
7477     int vector_len = 0;
7478     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7479   %}
7480   ins_pipe( pipe_slow );
7481 %}
7482 
7483 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7484   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7485   match(Set dst (MulVS src1 src2));
7486   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7487   ins_encode %{
7488     int vector_len = 1;
7489     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7490   %}
7491   ins_pipe( pipe_slow );
7492 %}
7493 
7494 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7495   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7496   match(Set dst (MulVS src (LoadVector mem)));
7497   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7498   ins_encode %{
7499     int vector_len = 1;
7500     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7501   %}
7502   ins_pipe( pipe_slow );
7503 %}
7504 
7505 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7506   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7507   match(Set dst (MulVS src1 src2));
7508   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7509   ins_encode %{
7510     int vector_len = 2;
7511     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7512   %}
7513   ins_pipe( pipe_slow );
7514 %}
7515 
7516 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7517   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7518   match(Set dst (MulVS src (LoadVector mem)));
7519   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7520   ins_encode %{
7521     int vector_len = 2;
7522     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7523   %}
7524   ins_pipe( pipe_slow );
7525 %}
7526 
7527 // Integers vector mul (sse4_1)
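// pmulld (packed 32-bit multiply, low 32 bits of each product) was introduced with SSE4.1,
// hence the UseSSE > 3 predicate on the non-AVX forms.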
7528 instruct vmul2I(vecD dst, vecD src) %{
7529   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7530   match(Set dst (MulVI dst src));
7531   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7532   ins_encode %{
7533     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7534   %}
7535   ins_pipe( pipe_slow );
7536 %}
7537 
7538 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7539   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7540   match(Set dst (MulVI src1 src2));
7541   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7542   ins_encode %{
7543     int vector_len = 0;
7544     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7545   %}
7546   ins_pipe( pipe_slow );
7547 %}
7548 
7549 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7550   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7551   match(Set dst (MulVI src (LoadVector mem)));
7552   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7553   ins_encode %{
7554     int vector_len = 0;
7555     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7556   %}
7557   ins_pipe( pipe_slow );
7558 %}
7559 
7560 instruct vmul4I(vecX dst, vecX src) %{
7561   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7562   match(Set dst (MulVI dst src));
7563   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7564   ins_encode %{
7565     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7571   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7572   match(Set dst (MulVI src1 src2));
7573   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7574   ins_encode %{
7575     int vector_len = 0;
7576     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7577   %}
7578   ins_pipe( pipe_slow );
7579 %}
7580 
7581 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7582   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7583   match(Set dst (MulVI src (LoadVector mem)));
7584   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7585   ins_encode %{
7586     int vector_len = 0;
7587     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7588   %}
7589   ins_pipe( pipe_slow );
7590 %}
7591 
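// Long vector mul: vpmullq is an AVX-512DQ instruction, so every MulVL form below also
// checks VM_Version::supports_avx512dq().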
7592 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7593   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7594   match(Set dst (MulVL src1 src2));
7595   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7596   ins_encode %{
7597     int vector_len = 0;
7598     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7599   %}
7600   ins_pipe( pipe_slow );
7601 %}
7602 
7603 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7604   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7605   match(Set dst (MulVL src (LoadVector mem)));
7606   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7607   ins_encode %{
7608     int vector_len = 0;
7609     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7610   %}
7611   ins_pipe( pipe_slow );
7612 %}
7613 
7614 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7615   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7616   match(Set dst (MulVL src1 src2));
7617   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7618   ins_encode %{
7619     int vector_len = 1;
7620     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7621   %}
7622   ins_pipe( pipe_slow );
7623 %}
7624 
7625 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7626   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7627   match(Set dst (MulVL src (LoadVector mem)));
7628   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7629   ins_encode %{
7630     int vector_len = 1;
7631     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7632   %}
7633   ins_pipe( pipe_slow );
7634 %}
7635 
7636 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7637   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7638   match(Set dst (MulVL src1 src2));
7639   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7640   ins_encode %{
7641     int vector_len = 2;
7642     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7643   %}
7644   ins_pipe( pipe_slow );
7645 %}
7646 
7647 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7648   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7649   match(Set dst (MulVL src (LoadVector mem)));
7650   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7651   ins_encode %{
7652     int vector_len = 2;
7653     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7654   %}
7655   ins_pipe( pipe_slow );
7656 %}
7657 
7658 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7659   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7660   match(Set dst (MulVI src1 src2));
7661   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7662   ins_encode %{
7663     int vector_len = 1;
7664     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7665   %}
7666   ins_pipe( pipe_slow );
7667 %}
7668 
7669 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7670   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7671   match(Set dst (MulVI src (LoadVector mem)));
7672   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7673   ins_encode %{
7674     int vector_len = 1;
7675     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7676   %}
7677   ins_pipe( pipe_slow );
7678 %}
7679 
7680 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7681   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7682   match(Set dst (MulVI src1 src2));
7683   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7684   ins_encode %{
7685     int vector_len = 2;
7686     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7687   %}
7688   ins_pipe( pipe_slow );
7689 %}
7690 
7691 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7692   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7693   match(Set dst (MulVI src (LoadVector mem)));
7694   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7695   ins_encode %{
7696     int vector_len = 2;
7697     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7698   %}
7699   ins_pipe( pipe_slow );
7700 %}
7701 
7702 // Floats vector mul
7703 instruct vmul2F(vecD dst, vecD src) %{
7704   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7705   match(Set dst (MulVF dst src));
7706   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7707   ins_encode %{
7708     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7709   %}
7710   ins_pipe( pipe_slow );
7711 %}
7712 
7713 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7714   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7715   match(Set dst (MulVF src1 src2));
7716   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7717   ins_encode %{
7718     int vector_len = 0;
7719     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7720   %}
7721   ins_pipe( pipe_slow );
7722 %}
7723 
7724 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7725   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7726   match(Set dst (MulVF src (LoadVector mem)));
7727   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7728   ins_encode %{
7729     int vector_len = 0;
7730     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7731   %}
7732   ins_pipe( pipe_slow );
7733 %}
7734 
7735 instruct vmul4F(vecX dst, vecX src) %{
7736   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7737   match(Set dst (MulVF dst src));
7738   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7739   ins_encode %{
7740     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7741   %}
7742   ins_pipe( pipe_slow );
7743 %}
7744 
7745 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7746   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7747   match(Set dst (MulVF src1 src2));
7748   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7749   ins_encode %{
7750     int vector_len = 0;
7751     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7752   %}
7753   ins_pipe( pipe_slow );
7754 %}
7755 
7756 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7757   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7758   match(Set dst (MulVF src (LoadVector mem)));
7759   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7760   ins_encode %{
7761     int vector_len = 0;
7762     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7763   %}
7764   ins_pipe( pipe_slow );
7765 %}
7766 
7767 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7768   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7769   match(Set dst (MulVF src1 src2));
7770   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7771   ins_encode %{
7772     int vector_len = 1;
7773     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7774   %}
7775   ins_pipe( pipe_slow );
7776 %}
7777 
7778 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7779   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7780   match(Set dst (MulVF src (LoadVector mem)));
7781   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7782   ins_encode %{
7783     int vector_len = 1;
7784     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7785   %}
7786   ins_pipe( pipe_slow );
7787 %}
7788 
7789 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7790   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7791   match(Set dst (MulVF src1 src2));
7792   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7793   ins_encode %{
7794     int vector_len = 2;
7795     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7796   %}
7797   ins_pipe( pipe_slow );
7798 %}
7799 
7800 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7801   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7802   match(Set dst (MulVF src (LoadVector mem)));
7803   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7804   ins_encode %{
7805     int vector_len = 2;
7806     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7807   %}
7808   ins_pipe( pipe_slow );
7809 %}
7810 
7811 // Doubles vector mul
7812 instruct vmul2D(vecX dst, vecX src) %{
7813   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7814   match(Set dst (MulVD dst src));
7815   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7816   ins_encode %{
7817     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7818   %}
7819   ins_pipe( pipe_slow );
7820 %}
7821 
7822 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7823   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7824   match(Set dst (MulVD src1 src2));
7825   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7826   ins_encode %{
7827     int vector_len = 0;
7828     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7829   %}
7830   ins_pipe( pipe_slow );
7831 %}
7832 
7833 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7834   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7835   match(Set dst (MulVD src (LoadVector mem)));
7836   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7837   ins_encode %{
7838     int vector_len = 0;
7839     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7840   %}
7841   ins_pipe( pipe_slow );
7842 %}
7843 
7844 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7845   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7846   match(Set dst (MulVD src1 src2));
7847   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7848   ins_encode %{
7849     int vector_len = 1;
7850     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7851   %}
7852   ins_pipe( pipe_slow );
7853 %}
7854 
7855 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7856   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7857   match(Set dst (MulVD src (LoadVector mem)));
7858   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7859   ins_encode %{
7860     int vector_len = 1;
7861     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7862   %}
7863   ins_pipe( pipe_slow );
7864 %}
7865 
7866 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7867   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7868   match(Set dst (MulVD src1 src2));
  format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
7870   ins_encode %{
7871     int vector_len = 2;
7872     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7873   %}
7874   ins_pipe( pipe_slow );
7875 %}
7876 
7877 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7878   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7879   match(Set dst (MulVD src (LoadVector mem)));
  format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
7881   ins_encode %{
7882     int vector_len = 2;
7883     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7884   %}
7885   ins_pipe( pipe_slow );
7886 %}
7887 
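// Vector conditional move (CMoveV): the packed compare below leaves a per-lane
// mask of all ones or all zeros in $dst, and the variable blend then takes each
// lane from $src2 where the mask is set and from $src1 otherwise, i.e. roughly
// dst[i] = mask[i] ? src2[i] : src1[i] (sketched here only as a reading aid).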
7888 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7889   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7890   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7891   effect(TEMP dst, USE src1, USE src2);
7892   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7893             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7894          %}
7895   ins_encode %{
7896     int vector_len = 1;
7897     int cond = (Assembler::Condition)($copnd$$cmpcode);
7898     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7899     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7900   %}
7901   ins_pipe( pipe_slow );
7902 %}
7903 
7904 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7905   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7906   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7907   effect(TEMP dst, USE src1, USE src2);
7908   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7909             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7910          %}
7911   ins_encode %{
7912     int vector_len = 1;
7913     int cond = (Assembler::Condition)($copnd$$cmpcode);
7914     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7915     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7916   %}
7917   ins_pipe( pipe_slow );
7918 %}
7919 
7920 // --------------------------------- DIV --------------------------------------
7921 
7922 // Floats vector div
7923 instruct vdiv2F(vecD dst, vecD src) %{
7924   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7925   match(Set dst (DivVF dst src));
7926   format %{ "divps   $dst,$src\t! div packed2F" %}
7927   ins_encode %{
7928     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7929   %}
7930   ins_pipe( pipe_slow );
7931 %}
7932 
7933 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7934   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7935   match(Set dst (DivVF src1 src2));
7936   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
7937   ins_encode %{
7938     int vector_len = 0;
7939     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7940   %}
7941   ins_pipe( pipe_slow );
7942 %}
7943 
7944 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
7945   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7946   match(Set dst (DivVF src (LoadVector mem)));
7947   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
7948   ins_encode %{
7949     int vector_len = 0;
7950     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7951   %}
7952   ins_pipe( pipe_slow );
7953 %}
7954 
7955 instruct vdiv4F(vecX dst, vecX src) %{
7956   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7957   match(Set dst (DivVF dst src));
7958   format %{ "divps   $dst,$src\t! div packed4F" %}
7959   ins_encode %{
7960     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7961   %}
7962   ins_pipe( pipe_slow );
7963 %}
7964 
7965 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
7966   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7967   match(Set dst (DivVF src1 src2));
7968   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
7969   ins_encode %{
7970     int vector_len = 0;
7971     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7972   %}
7973   ins_pipe( pipe_slow );
7974 %}
7975 
7976 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
7977   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7978   match(Set dst (DivVF src (LoadVector mem)));
7979   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
7980   ins_encode %{
7981     int vector_len = 0;
7982     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7983   %}
7984   ins_pipe( pipe_slow );
7985 %}
7986 
7987 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
7988   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7989   match(Set dst (DivVF src1 src2));
7990   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
7991   ins_encode %{
7992     int vector_len = 1;
7993     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7994   %}
7995   ins_pipe( pipe_slow );
7996 %}
7997 
7998 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
7999   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8000   match(Set dst (DivVF src (LoadVector mem)));
8001   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8002   ins_encode %{
8003     int vector_len = 1;
8004     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8005   %}
8006   ins_pipe( pipe_slow );
8007 %}
8008 
8009 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8011   match(Set dst (DivVF src1 src2));
8012   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8013   ins_encode %{
8014     int vector_len = 2;
8015     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8016   %}
8017   ins_pipe( pipe_slow );
8018 %}
8019 
8020 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8022   match(Set dst (DivVF src (LoadVector mem)));
8023   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8024   ins_encode %{
8025     int vector_len = 2;
8026     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8027   %}
8028   ins_pipe( pipe_slow );
8029 %}
8030 
8031 // Doubles vector div
8032 instruct vdiv2D(vecX dst, vecX src) %{
8033   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8034   match(Set dst (DivVD dst src));
8035   format %{ "divpd   $dst,$src\t! div packed2D" %}
8036   ins_encode %{
8037     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8038   %}
8039   ins_pipe( pipe_slow );
8040 %}
8041 
8042 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8043   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8044   match(Set dst (DivVD src1 src2));
8045   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8046   ins_encode %{
8047     int vector_len = 0;
8048     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8049   %}
8050   ins_pipe( pipe_slow );
8051 %}
8052 
8053 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8054   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8055   match(Set dst (DivVD src (LoadVector mem)));
8056   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8057   ins_encode %{
8058     int vector_len = 0;
8059     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8060   %}
8061   ins_pipe( pipe_slow );
8062 %}
8063 
8064 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8065   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8066   match(Set dst (DivVD src1 src2));
8067   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8068   ins_encode %{
8069     int vector_len = 1;
8070     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8071   %}
8072   ins_pipe( pipe_slow );
8073 %}
8074 
8075 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8076   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8077   match(Set dst (DivVD src (LoadVector mem)));
8078   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8079   ins_encode %{
8080     int vector_len = 1;
8081     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8082   %}
8083   ins_pipe( pipe_slow );
8084 %}
8085 
8086 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8087   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8088   match(Set dst (DivVD src1 src2));
8089   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8090   ins_encode %{
8091     int vector_len = 2;
8092     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8093   %}
8094   ins_pipe( pipe_slow );
8095 %}
8096 
8097 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8098   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8099   match(Set dst (DivVD src (LoadVector mem)));
8100   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8101   ins_encode %{
8102     int vector_len = 2;
8103     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8104   %}
8105   ins_pipe( pipe_slow );
8106 %}
8107 
8108 // --------------------------------- Sqrt --------------------------------------
8109 
8110 // Floating point vector sqrt
8111 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8112   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8113   match(Set dst (SqrtVD src));
8114   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8115   ins_encode %{
8116     int vector_len = 0;
8117     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8118   %}
8119   ins_pipe( pipe_slow );
8120 %}
8121 
8122 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8123   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8124   match(Set dst (SqrtVD (LoadVector mem)));
8125   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8126   ins_encode %{
8127     int vector_len = 0;
8128     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8129   %}
8130   ins_pipe( pipe_slow );
8131 %}
8132 
8133 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8134   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8135   match(Set dst (SqrtVD src));
8136   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8137   ins_encode %{
8138     int vector_len = 1;
8139     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8140   %}
8141   ins_pipe( pipe_slow );
8142 %}
8143 
8144 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8145   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8146   match(Set dst (SqrtVD (LoadVector mem)));
8147   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8148   ins_encode %{
8149     int vector_len = 1;
8150     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8151   %}
8152   ins_pipe( pipe_slow );
8153 %}
8154 
8155 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8156   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8157   match(Set dst (SqrtVD src));
8158   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8159   ins_encode %{
8160     int vector_len = 2;
8161     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8162   %}
8163   ins_pipe( pipe_slow );
8164 %}
8165 
8166 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8167   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8168   match(Set dst (SqrtVD (LoadVector mem)));
8169   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8170   ins_encode %{
8171     int vector_len = 2;
8172     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8173   %}
8174   ins_pipe( pipe_slow );
8175 %}
8176 
8177 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8178   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8179   match(Set dst (SqrtVF src));
8180   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8181   ins_encode %{
8182     int vector_len = 0;
8183     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8184   %}
8185   ins_pipe( pipe_slow );
8186 %}
8187 
8188 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8189   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8190   match(Set dst (SqrtVF (LoadVector mem)));
8191   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8192   ins_encode %{
8193     int vector_len = 0;
8194     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8195   %}
8196   ins_pipe( pipe_slow );
8197 %}
8198 
8199 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8200   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8201   match(Set dst (SqrtVF src));
8202   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8203   ins_encode %{
8204     int vector_len = 0;
8205     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8206   %}
8207   ins_pipe( pipe_slow );
8208 %}
8209 
8210 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8211   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8212   match(Set dst (SqrtVF (LoadVector mem)));
8213   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8214   ins_encode %{
8215     int vector_len = 0;
8216     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8217   %}
8218   ins_pipe( pipe_slow );
8219 %}
8220 
8221 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8222   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8223   match(Set dst (SqrtVF src));
8224   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8225   ins_encode %{
8226     int vector_len = 1;
8227     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8228   %}
8229   ins_pipe( pipe_slow );
8230 %}
8231 
8232 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8233   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8234   match(Set dst (SqrtVF (LoadVector mem)));
8235   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8236   ins_encode %{
8237     int vector_len = 1;
8238     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8239   %}
8240   ins_pipe( pipe_slow );
8241 %}
8242 
8243 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8244   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8245   match(Set dst (SqrtVF src));
8246   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8247   ins_encode %{
8248     int vector_len = 2;
8249     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8250   %}
8251   ins_pipe( pipe_slow );
8252 %}
8253 
8254 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8255   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8256   match(Set dst (SqrtVF (LoadVector mem)));
8257   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8258   ins_encode %{
8259     int vector_len = 2;
8260     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8261   %}
8262   ins_pipe( pipe_slow );
8263 %}
8264 
8265 // ------------------------------ Shift ---------------------------------------
8266 
8267 // Left and right shift count vectors are the same on x86
8268 // (only lowest bits of xmm reg are used for count).
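// For example, one count loaded with movdl can feed psllw, psrlw and psraw
// alike, which is why the single rule below matches both LShiftCntV and
// RShiftCntV.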
8269 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8270   match(Set dst (LShiftCntV cnt));
8271   match(Set dst (RShiftCntV cnt));
8272   format %{ "movdl    $dst,$cnt\t! load shift count" %}
8273   ins_encode %{
8274     __ movdl($dst$$XMMRegister, $cnt$$Register);
8275   %}
8276   ins_pipe( pipe_slow );
8277 %}
8278 
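// There is no form of movdl that takes an immediate, so an immediate shift
// count is first materialized in a scratch general register.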
8279 instruct vshiftcntimm(vecS dst, immI8 cnt, rRegI tmp) %{
8280   match(Set dst cnt);
8281   effect(TEMP tmp);
8282   format %{ "movl    $tmp,$cnt\t"
8283             "movdl   $dst,$tmp\t! load shift count" %}
8284   ins_encode %{
8285     __ movl($tmp$$Register, $cnt$$constant);
8286     __ movdl($dst$$XMMRegister, $tmp$$Register);
8287   %}
8288   ins_pipe( pipe_slow );
8289 %}
8290 
8291 // Byte vector shift
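// x86 has no packed byte shift instruction, so the rules below widen the bytes
// to words (vextendbw), shift the words, mask each result back to byte range
// with 0x00ff and re-pack with packuswb/vpackuswb.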
8292 instruct vshift4B(vecS dst, vecS src, vecS shift, vecS tmp, rRegI scratch) %{
8293   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
8294   match(Set dst (LShiftVB src shift));
8295   match(Set dst (RShiftVB src shift));
8296   match(Set dst (URShiftVB src shift));
8297   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8298   format %{"vextendbw $tmp,$src\n\t"
8299            "vshiftw   $tmp,$shift\n\t"
           "movdqu    $dst,[0x00ff00ff,0x00ff00ff]\n\t"
8301            "pand      $dst,$tmp\n\t"
8302            "packuswb  $dst,$dst\n\t ! packed4B shift" %}
8303   ins_encode %{
8304     int opcode = this->as_Mach()->ideal_Opcode();
8305 
8306     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
8307     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
8308     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); 
8309     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
8310     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
8311   %}
8312   ins_pipe( pipe_slow );
8313 %}
8314 
8315 instruct vshift8B(vecD dst, vecD src, vecS shift, vecD tmp, rRegI scratch) %{
8316   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
8317   match(Set dst (LShiftVB src shift));
8318   match(Set dst (RShiftVB src shift));
8319   match(Set dst (URShiftVB src shift));
8320   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8321   format %{"vextendbw $tmp,$src\n\t"
8322            "vshiftw   $tmp,$shift\n\t"
           "movdqu    $dst,[0x00ff00ff,0x00ff00ff]\n\t"
8324            "pand      $dst,$tmp\n\t"
8325            "packuswb  $dst,$dst\n\t ! packed8B shift" %}
8326   ins_encode %{
8327     int opcode = this->as_Mach()->ideal_Opcode();
8328 
8329     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
8330     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
8331     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); 
8332     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
8333     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
8334   %}
8335   ins_pipe( pipe_slow );
8336 %}
8337 
8338 instruct vshift16B(vecX dst, vecX src, vecS shift, vecX tmp1, vecX tmp2, rRegI scratch) %{
8339   predicate(UseSSE > 3  && UseAVX <= 1 && n->as_Vector()->length() == 16);
8340   match(Set dst (LShiftVB src shift));
8341   match(Set dst (RShiftVB src shift));
8342   match(Set dst (URShiftVB src shift));
8343   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
8344   format %{"vextendbw $tmp1,$src\n\t"
8345            "vshiftw   $tmp1,$shift\n\t"
8346            "pshufd    $tmp2,$src\n\t"
8347            "vextendbw $tmp2,$tmp2\n\t"
8348            "vshiftw   $tmp2,$shift\n\t"
           "movdqu    $dst,[0x00ff00ff,0x00ff00ff]\n\t"
8350            "pand      $tmp2,$dst\n\t"
8351            "pand      $dst,$tmp1\n\t"
8352            "packuswb  $dst,$tmp2\n\t! packed16B shift" %}
8353   ins_encode %{
8354     int opcode = this->as_Mach()->ideal_Opcode();
8355 
8356     __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister);
8357     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
8358     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
8359     __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
8360     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
8361     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
8362     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
8363     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
8364     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
8365   %}
8366   ins_pipe( pipe_slow );
8367 %}
8368 
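// With AVX2 the 16-byte case can be widened into a single 256-bit word vector,
// shifted and masked there, and then re-packed from its high and low 128-bit
// halves.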
8369 instruct vshift16B_avx(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8370   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8371   match(Set dst (LShiftVB src shift));
8372   match(Set dst (RShiftVB src shift));
8373   match(Set dst (URShiftVB src shift));
8374   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8375   format %{"vextendbw  $tmp,$src\n\t"
8376            "vshiftw    $tmp,$tmp,$shift\n\t"
           "vpand      $tmp,$tmp,[0x00ff00ff,0x00ff00ff]\n\t"
8378            "vextracti128_high  $dst,$tmp\n\t"
8379            "vpackuswb  $dst,$tmp,$dst\n\t! packed16B shift" %}
8380   ins_encode %{
8381     int opcode = this->as_Mach()->ideal_Opcode();
8382 
8383     int vector_len = 1;
8384     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
8385     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8386     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8387     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
8388     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
8389   %}
8390   ins_pipe( pipe_slow );
8391 %}
8392 
8393 instruct vshift32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8394   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
8395   match(Set dst (LShiftVB src shift));
8396   match(Set dst (RShiftVB src shift));
8397   match(Set dst (URShiftVB src shift));
8398   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8399   format %{"vextracti128_high  $tmp,$src\n\t"
8400            "vextendbw  $tmp,$tmp\n\t"
8401            "vextendbw  $dst,$src\n\t"
8402            "vshiftw    $tmp,$tmp,$shift\n\t"
8403            "vshiftw    $dst,$dst,$shift\n\t"
           "vpand      $tmp,$tmp,[0x00ff00ff,0x00ff00ff]\n\t"
           "vpand      $dst,$dst,[0x00ff00ff,0x00ff00ff]\n\t"
8406            "vpackuswb  $dst,$dst,$tmp\n\t"
8407            "vpermq     $dst,$dst,0xD8\n\t! packed32B shift" %}
8408   ins_encode %{
8409     int opcode = this->as_Mach()->ideal_Opcode();
8410 
8411     int vector_len = 1;
8412     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
8413     __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8414     __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
8415     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8416     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len);
8417     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8418     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8419     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8420     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
8421   %}
8422   ins_pipe( pipe_slow );
8423 %}
8424 
8425 instruct vshift64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
8426   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
8427   match(Set dst (LShiftVB src shift));
8428   match(Set dst (RShiftVB src shift));
8429   match(Set dst (URShiftVB src shift));
8430   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
8431   format %{"vextracti64x4  $tmp1,$src\n\t"
8432            "vextendbw      $tmp1,$tmp1\n\t"
8433            "vextendbw      $tmp2,$src\n\t"
8434            "vshiftw        $tmp1,$tmp1,$shift\n\t"
8435            "vshiftw        $tmp2,$tmp2,$shift\n\t"
           "vmovdqu        $dst,[0x00ff00ff,0x00ff00ff]\n\t"
8437            "vpbroadcastd   $dst,$dst\n\t"
8438            "vpand          $tmp1,$tmp1,$dst\n\t"
8439            "vpand          $tmp2,$tmp2,$dst\n\t"
8440            "vpackuswb      $dst,$tmp1,$tmp2\n\t"
8441            "evmovdquq      $tmp2, [0x0604020007050301]\n\t"
8442            "vpermq         $dst,$tmp2,$dst\n\t! packed64B shift" %}
8443   ins_encode %{
8444     int opcode = this->as_Mach()->ideal_Opcode();
8445 
8446     int vector_len = 2;
8447     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
8448     __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8449     __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
8450     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len);
8451     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
8452     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
8453     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
8454     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
8455     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
8456     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8457     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
8458     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
8459   %}
8460   ins_pipe( pipe_slow );
8461 %}
8462 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data because Java code converts the short value into an int with
// sign extension before the shift. But char vectors are fine since chars are
// unsigned values.
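// For example, for short s = 0xffff (-1), Java computes (s >>> 3) on the
// sign-extended int 0xffffffff and the narrowed short result is still 0xffff,
// while a packed 16-bit logical shift would produce 0x1fff; for char c = 0xffff
// both give 0x1fff.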
8467 // Shorts/Chars vector left shift
instruct vshift2S(vecS dst, vecS src, vecS shift) %{
8469   predicate(n->as_Vector()->length() == 2);
8470   match(Set dst (LShiftVS src shift));
8471   match(Set dst (RShiftVS src shift));
8472   match(Set dst (URShiftVS src shift));
8473   effect(TEMP dst, USE src, USE shift);
8474   format %{ "vshiftw  $dst,$src,$shift\t! shift packed2S" %}
8475   ins_encode %{
8476     int opcode = this->as_Mach()->ideal_Opcode();
8477     if (UseAVX == 0) { 
8478       if ($dst$$XMMRegister != $src$$XMMRegister)
8479          __ movflt($dst$$XMMRegister, $src$$XMMRegister);
8480       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8481     } else {
8482       int vector_len = 0;
8483       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8484     }
8485   %}
8486   ins_pipe( pipe_slow );
8487 %}
8488 
8489 instruct vshift4S(vecD dst, vecD src, vecS shift) %{
8490   predicate(n->as_Vector()->length() == 4);
8491   match(Set dst (LShiftVS src shift));
8492   match(Set dst (RShiftVS src shift));
8493   match(Set dst (URShiftVS src shift));
8494   effect(TEMP dst, USE src, USE shift);
8495   format %{ "vshiftw  $dst,$src,$shift\t! shift packed4S" %}
8496   ins_encode %{
8497     int opcode = this->as_Mach()->ideal_Opcode();
8498     if (UseAVX == 0) { 
8499       if ($dst$$XMMRegister != $src$$XMMRegister)
8500          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8501       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
    } else {
8504       int vector_len = 0;
8505       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8506     }
8507   %}
8508   ins_pipe( pipe_slow );
8509 %}
8510 
8511 instruct vshift8S(vecX dst, vecX src, vecS shift) %{
8512   predicate(n->as_Vector()->length() == 8);
8513   match(Set dst (LShiftVS src shift));
8514   match(Set dst (RShiftVS src shift));
8515   match(Set dst (URShiftVS src shift));
8516   effect(TEMP dst, USE src, USE shift);
8517   format %{ "vshiftw  $dst,$src,$shift\t! shift packed8S" %}
8518   ins_encode %{
8519     int opcode = this->as_Mach()->ideal_Opcode();
8520     if (UseAVX == 0) { 
8521       if ($dst$$XMMRegister != $src$$XMMRegister)
8522          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8523       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8524     } else {
8525       int vector_len = 0;
8526       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8527     }
8528   %}
8529   ins_pipe( pipe_slow );
8530 %}
8531 
8532 instruct vshift16S(vecY dst, vecY src, vecS shift) %{
8533   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8534   match(Set dst (LShiftVS src shift));
8535   match(Set dst (RShiftVS src shift));
8536   match(Set dst (URShiftVS src shift));
8537   effect(DEF dst, USE src, USE shift);
8538   format %{ "vshiftw  $dst,$src,$shift\t! shift packed16S" %}
8539   ins_encode %{
8540     int vector_len = 1;
8541     int opcode = this->as_Mach()->ideal_Opcode();
8542     __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8543   %}
8544   ins_pipe( pipe_slow );
8545 %}
8546 
8547 instruct vshift32S(vecZ dst, vecZ src, vecS shift) %{
8548   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8549   match(Set dst (LShiftVS src shift));
8550   match(Set dst (RShiftVS src shift));
8551   match(Set dst (URShiftVS src shift));
8552   effect(DEF dst, USE src, USE shift);
8553   format %{ "vshiftw  $dst,$src,$shift\t! shift packed32S" %}
8554   ins_encode %{
8555     int vector_len = 2;
8556     int opcode = this->as_Mach()->ideal_Opcode();
8557     __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8558   %}
8559   ins_pipe( pipe_slow );
8560 %}
8561 
8562 // Integers vector left shift
8563 instruct vshift2I(vecD dst, vecD src, vecS shift) %{
8564   predicate(n->as_Vector()->length() == 2);
8565   match(Set dst (LShiftVI src shift));
8566   match(Set dst (RShiftVI src shift));
8567   match(Set dst (URShiftVI src shift));
8568   effect(TEMP dst, USE src, USE shift);
8569   format %{ "vshiftd  $dst,$src,$shift\t! shift packed2I" %}
8570   ins_encode %{
8571     int opcode = this->as_Mach()->ideal_Opcode();
8572     if (UseAVX == 0) { 
8573       if ($dst$$XMMRegister != $src$$XMMRegister)
8574          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8575       __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8576     } else {
8577       int vector_len = 0;
8578       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8579     }
8580   %}
8581   ins_pipe( pipe_slow );
8582 %}
8583 
8584 instruct vshift4I(vecX dst, vecX src, vecS shift) %{
8585   predicate(n->as_Vector()->length() == 4);
8586   match(Set dst (LShiftVI src shift));
8587   match(Set dst (RShiftVI src shift));
8588   match(Set dst (URShiftVI src shift));
8589   effect(TEMP dst, USE src, USE shift);
8590   format %{ "vshiftd  $dst,$src,$shift\t! shift packed4I" %}
8591   ins_encode %{
8592     int opcode = this->as_Mach()->ideal_Opcode();
8593     if (UseAVX == 0) { 
8594       if ($dst$$XMMRegister != $src$$XMMRegister)
8595          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8596       __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8597     } else {
8598       int vector_len = 0;
8599       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8600     }
8601   %}
8602   ins_pipe( pipe_slow );
8603 %}
8604 
8605 instruct vshift8I(vecY dst, vecY src, vecS shift) %{
8606   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8607   match(Set dst (LShiftVI src shift));
8608   match(Set dst (RShiftVI src shift));
8609   match(Set dst (URShiftVI src shift));
8610   effect(DEF dst, USE src, USE shift);
8611   format %{ "vshiftd  $dst,$src,$shift\t! shift packed8I" %}
8612   ins_encode %{
8613     int vector_len = 1;
8614     int opcode = this->as_Mach()->ideal_Opcode();
8615     __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8616   %}
8617   ins_pipe( pipe_slow );
8618 %}
8619 
8620 instruct vshift16I(vecZ dst, vecZ src, vecS shift) %{
8621   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8622   match(Set dst (LShiftVI src shift));
8623   match(Set dst (RShiftVI src shift));
8624   match(Set dst (URShiftVI src shift));
8625   effect(DEF dst, USE src, USE shift);
8626   format %{ "vshiftd  $dst,$src,$shift\t! shift packed16I" %}
8627   ins_encode %{
8628     int vector_len = 2;
8629     int opcode = this->as_Mach()->ideal_Opcode();
8630     __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8631   %}
8632   ins_pipe( pipe_slow );
8633 %}
8634 
8635 // Longs vector shift
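// Note that the 128-bit and 256-bit rules here match only LShiftVL and
// URShiftVL; the arithmetic right shift (RShiftVL) for those sizes is handled
// by the vsra* rules further below, while the 512-bit rule also matches
// RShiftVL directly.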
8636 instruct vshift2L(vecX dst, vecX src, vecS shift) %{
8637   predicate(n->as_Vector()->length() == 2);
8638   match(Set dst (LShiftVL src shift));
8639   match(Set dst (URShiftVL src shift));
8640   effect(TEMP dst, USE src, USE shift);
8641   format %{ "vshiftq  $dst,$src,$shift\t! shift packed2L" %}
8642   ins_encode %{
8643     int opcode = this->as_Mach()->ideal_Opcode();
8644     if (UseAVX == 0) { 
8645       if ($dst$$XMMRegister != $src$$XMMRegister)
8646          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8647       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8648     } else {
8649       int vector_len = 0;
8650       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8651     }
8652   %}
8653   ins_pipe( pipe_slow );
8654 %}
8655 
8656 instruct vshift4L(vecY dst, vecY src, vecS shift) %{
8657   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8658   match(Set dst (LShiftVL src shift));
8659   match(Set dst (URShiftVL src shift));
8660   effect(DEF dst, USE src, USE shift);
  format %{ "vshiftq  $dst,$src,$shift\t! shift packed4L" %}
8662   ins_encode %{
8663     int vector_len = 1;
8664     int opcode = this->as_Mach()->ideal_Opcode();
8665     __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8666   %}
8667   ins_pipe( pipe_slow );
8668 %}
8669 
8670 instruct vshift8L(vecZ dst, vecZ src, vecS shift) %{
8671   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8672   match(Set dst (LShiftVL src shift));
8673   match(Set dst (RShiftVL src shift));
8674   match(Set dst (URShiftVL src shift));
8675   effect(DEF dst, USE src, USE shift);
8676   format %{ "vshiftq  $dst,$src,$shift\t! shift packed8L" %}
8677   ins_encode %{
8678     int vector_len = 2;
8679     int opcode = this->as_Mach()->ideal_Opcode();
8680     __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8681   %}
8682   ins_pipe( pipe_slow );
8683 %}
8684 
8685 // -------------------ArithmeticRightShift -----------------------------------
8686 // Long vector arithmetic right shift
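// Before AVX-512 there is no packed 64-bit arithmetic shift, so the non-evex
// rules below synthesize it from the logical shift and the shifted sign mask:
//   sra(x, n) == (srl(x, n) ^ m) - m,  where m = srl(0x8000000000000000, n).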
8687 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8688   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
8689   match(Set dst (RShiftVL src shift));
8690   effect(TEMP dst, TEMP tmp, TEMP scratch);
8691   format %{ "movdqu  $dst,$src\n\t"
8692             "psrlq   $dst,$shift\n\t"
8693             "movdqu  $tmp,[0x8000000000000000]\n\t"
8694             "psrlq   $tmp,$shift\n\t"
8695             "pxor    $dst,$tmp\n\t"
8696             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
8697   ins_encode %{
8698     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8699     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8700     __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8701     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
8702     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
8703     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
8704   %}
8705   ins_pipe( pipe_slow );
8706 %}
8707 
8708 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
8709   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
8710   match(Set dst (RShiftVL src shift));
8711   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
8712   ins_encode %{
8713     int vector_len = 0;
8714     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8715   %}
8716   ins_pipe( pipe_slow );
8717 %}
8718 
8719 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8720   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8721   match(Set dst (RShiftVL src shift));
8722   effect(TEMP dst, TEMP tmp, TEMP scratch);
8723   format %{ "vpsrlq   $dst,$src,$shift\n\t"
8724             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
8725             "vpsrlq   $tmp,$tmp,$shift\n\t"
8726             "vpxor    $dst,$dst,$tmp\n\t"
8727             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
8728   ins_encode %{
8729     int vector_len = 1;
8730     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8731     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8732     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8733     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8734     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8735   %}
8736   ins_pipe( pipe_slow );
8737 %}
8738 
8739 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
8740   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
8741   match(Set dst (RShiftVL src shift));
8742   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
8743   ins_encode %{
8744     int vector_len = 1;
8745     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8746   %}
8747   ins_pipe( pipe_slow );
8748 %}
8749 
8750 // --------------------------------- AND --------------------------------------
8751 
8752 instruct vand4B(vecS dst, vecS src) %{
8753   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
8754   match(Set dst (AndV dst src));
8755   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
8756   ins_encode %{
8757     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8758   %}
8759   ins_pipe( pipe_slow );
8760 %}
8761 
8762 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
8763   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8764   match(Set dst (AndV src1 src2));
8765   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
8766   ins_encode %{
8767     int vector_len = 0;
8768     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8769   %}
8770   ins_pipe( pipe_slow );
8771 %}
8772 
8773 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
8774   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8775   match(Set dst (AndV src (LoadVector mem)));
8776   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
8777   ins_encode %{
8778     int vector_len = 0;
8779     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8780   %}
8781   ins_pipe( pipe_slow );
8782 %}
8783 
8784 instruct vand8B(vecD dst, vecD src) %{
8785   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
8786   match(Set dst (AndV dst src));
8787   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
8788   ins_encode %{
8789     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8790   %}
8791   ins_pipe( pipe_slow );
8792 %}
8793 
8794 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
8795   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8796   match(Set dst (AndV src1 src2));
8797   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
8798   ins_encode %{
8799     int vector_len = 0;
8800     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8801   %}
8802   ins_pipe( pipe_slow );
8803 %}
8804 
8805 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
8806   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8807   match(Set dst (AndV src (LoadVector mem)));
8808   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
8809   ins_encode %{
8810     int vector_len = 0;
8811     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8812   %}
8813   ins_pipe( pipe_slow );
8814 %}
8815 
8816 instruct vand16B(vecX dst, vecX src) %{
8817   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
8818   match(Set dst (AndV dst src));
8819   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
8820   ins_encode %{
8821     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8822   %}
8823   ins_pipe( pipe_slow );
8824 %}
8825 
8826 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
8827   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8828   match(Set dst (AndV src1 src2));
8829   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
8830   ins_encode %{
8831     int vector_len = 0;
8832     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8833   %}
8834   ins_pipe( pipe_slow );
8835 %}
8836 
8837 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
8838   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8839   match(Set dst (AndV src (LoadVector mem)));
8840   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
8841   ins_encode %{
8842     int vector_len = 0;
8843     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8844   %}
8845   ins_pipe( pipe_slow );
8846 %}
8847 
8848 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
8849   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8850   match(Set dst (AndV src1 src2));
8851   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
8852   ins_encode %{
8853     int vector_len = 1;
8854     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8855   %}
8856   ins_pipe( pipe_slow );
8857 %}
8858 
8859 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
8860   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8861   match(Set dst (AndV src (LoadVector mem)));
8862   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
8863   ins_encode %{
8864     int vector_len = 1;
8865     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8866   %}
8867   ins_pipe( pipe_slow );
8868 %}
8869 
8870 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
8871   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
8872   match(Set dst (AndV src1 src2));
8873   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
8874   ins_encode %{
8875     int vector_len = 2;
8876     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8877   %}
8878   ins_pipe( pipe_slow );
8879 %}
8880 
8881 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
8882   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
8883   match(Set dst (AndV src (LoadVector mem)));
8884   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
8885   ins_encode %{
8886     int vector_len = 2;
8887     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8888   %}
8889   ins_pipe( pipe_slow );
8890 %}
8891 
8892 // --------------------------------- OR ---------------------------------------
8893 
8894 instruct vor4B(vecS dst, vecS src) %{
8895   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
8896   match(Set dst (OrV dst src));
8897   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
8898   ins_encode %{
8899     __ por($dst$$XMMRegister, $src$$XMMRegister);
8900   %}
8901   ins_pipe( pipe_slow );
8902 %}
8903 
8904 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
8905   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8906   match(Set dst (OrV src1 src2));
8907   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
8908   ins_encode %{
8909     int vector_len = 0;
8910     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8911   %}
8912   ins_pipe( pipe_slow );
8913 %}
8914 
8915 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
8916   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8917   match(Set dst (OrV src (LoadVector mem)));
8918   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
8919   ins_encode %{
8920     int vector_len = 0;
8921     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8922   %}
8923   ins_pipe( pipe_slow );
8924 %}
8925 
8926 instruct vor8B(vecD dst, vecD src) %{
8927   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
8928   match(Set dst (OrV dst src));
8929   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
8930   ins_encode %{
8931     __ por($dst$$XMMRegister, $src$$XMMRegister);
8932   %}
8933   ins_pipe( pipe_slow );
8934 %}
8935 
8936 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
8937   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8938   match(Set dst (OrV src1 src2));
8939   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
8940   ins_encode %{
8941     int vector_len = 0;
8942     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8943   %}
8944   ins_pipe( pipe_slow );
8945 %}
8946 
8947 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8949   match(Set dst (OrV src (LoadVector mem)));
8950   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
8951   ins_encode %{
8952     int vector_len = 0;
8953     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8954   %}
8955   ins_pipe( pipe_slow );
8956 %}
8957 
8958 instruct vor16B(vecX dst, vecX src) %{
8959   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
8960   match(Set dst (OrV dst src));
8961   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
8962   ins_encode %{
8963     __ por($dst$$XMMRegister, $src$$XMMRegister);
8964   %}
8965   ins_pipe( pipe_slow );
8966 %}
8967 
8968 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
8969   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8970   match(Set dst (OrV src1 src2));
8971   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
8972   ins_encode %{
8973     int vector_len = 0;
8974     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8975   %}
8976   ins_pipe( pipe_slow );
8977 %}
8978 
8979 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
8980   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8981   match(Set dst (OrV src (LoadVector mem)));
8982   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
8983   ins_encode %{
8984     int vector_len = 0;
8985     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8986   %}
8987   ins_pipe( pipe_slow );
8988 %}
8989 
8990 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
8991   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8992   match(Set dst (OrV src1 src2));
8993   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
8994   ins_encode %{
8995     int vector_len = 1;
8996     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8997   %}
8998   ins_pipe( pipe_slow );
8999 %}
9000 
9001 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9002   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9003   match(Set dst (OrV src (LoadVector mem)));
9004   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9005   ins_encode %{
9006     int vector_len = 1;
9007     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9008   %}
9009   ins_pipe( pipe_slow );
9010 %}
9011 
9012 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9013   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9014   match(Set dst (OrV src1 src2));
9015   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9016   ins_encode %{
9017     int vector_len = 2;
9018     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9019   %}
9020   ins_pipe( pipe_slow );
9021 %}
9022 
9023 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9024   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9025   match(Set dst (OrV src (LoadVector mem)));
9026   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9027   ins_encode %{
9028     int vector_len = 2;
9029     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9030   %}
9031   ins_pipe( pipe_slow );
9032 %}
9033 
9034 // --------------------------------- XOR --------------------------------------
9035 
9036 instruct vxor4B(vecS dst, vecS src) %{
9037   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9038   match(Set dst (XorV dst src));
9039   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9040   ins_encode %{
9041     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9042   %}
9043   ins_pipe( pipe_slow );
9044 %}
9045 
instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B(vecD dst, vecD src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B(vecX dst, vecX src) %{
  predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
  ins_encode %{
    int vector_len = 0;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- ABS --------------------------------------
// a = |a|
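// pabsb/pabsw/pabsd are SSSE3 instructions, hence the UseSSE > 2 predicates
// on the 128-bit-and-smaller forms; the 256-bit byte/short/int forms need
// AVX2 and the 512-bit forms need AVX-512.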
instruct vabs4B_reg(vecS dst, vecS src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
  match(Set dst (AbsVB  src));
  format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed4B" %}
  ins_encode %{
    __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs8B_reg(vecD dst, vecD src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AbsVB  src));
  format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %}
  ins_encode %{
    __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs16B_reg(vecX dst, vecX src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AbsVB  src));
  format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %}
  ins_encode %{
    __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs32B_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  match(Set dst (AbsVB  src));
  format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs64B_reg(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
  match(Set dst (AbsVB  src));
  format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed64B" %}
  ins_encode %{
    int vector_len = 2;
    __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs2S_reg(vecD dst, vecD src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
  match(Set dst (AbsVS  src));
  format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed2S" %}
  ins_encode %{
    __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs4S_reg(vecD dst, vecD src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
  match(Set dst (AbsVS  src));
  format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %}
  ins_encode %{
    __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs8S_reg(vecX dst, vecX src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AbsVS  src));
  format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %}
  ins_encode %{
    __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs16S_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (AbsVS  src));
  format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs32S_reg(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
  match(Set dst (AbsVS  src));
  format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs2I_reg(vecD dst, vecD src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
  match(Set dst (AbsVI  src));
  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %}
  ins_encode %{
    __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs4I_reg(vecX dst, vecX src) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
  match(Set dst (AbsVI  src));
  format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %}
  ins_encode %{
    __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs8I_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (AbsVI src));
  format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs16I_reg(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AbsVI src));
  format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

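// There is no SSE/AVX2 packed absolute value for 64-bit lanes; vpabsq is an
// AVX-512 instruction, so all long variants require UseAVX > 2.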
instruct vabs2L_reg(vecX dst, vecX src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
  match(Set dst (AbsVL  src));
  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs4L_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
  match(Set dst (AbsVL  src));
  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs8L_reg(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AbsVL  src));
  format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- ABSNEG --------------------------------------

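// Abs and Neg of packed floats/doubles share one pattern: the ideal opcode
// passed to the vabsnegd/vabsnegf macro assembler routines selects between
// clearing the sign bits (abs) and flipping them (neg). The sign masks live
// in memory, and the TEMP scratch register is available for addressing them.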
instruct vabsneg2D(vecX dst, vecX src, rRegI scratch) %{
  predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
  match(Set dst (AbsVD  src));
  match(Set dst (NegVD  src));
  effect(TEMP scratch);
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed2D" %}
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    if ($dst$$XMMRegister != $src$$XMMRegister)
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    __ vabsnegd(opcode, $dst$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsneg4D(vecY dst, vecY src, rRegI scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AbsVD  src));
  match(Set dst (NegVD  src));
  effect(TEMP scratch);
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed4D" %}
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    int vector_len = 1;
    __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsneg8D(vecZ dst, vecZ src, rRegI scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (AbsVD  src));
  match(Set dst (NegVD  src));
  effect(TEMP scratch);
  format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed8D" %}
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    int vector_len = 2;
    __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsneg2F(vecD dst, vecD src, rRegI scratch) %{
  predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
  match(Set dst (AbsVF  src));
  match(Set dst (NegVF  src));
  effect(TEMP scratch);
  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed2F" %}
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    if ($dst$$XMMRegister != $src$$XMMRegister)
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

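// In-place 4F variant: dst is also the source operand, so no copy is needed.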
instruct vabsneg4F(vecX dst, rRegI scratch) %{
  predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
  match(Set dst (AbsVF  dst));
  match(Set dst (NegVF  dst));
  effect(TEMP scratch);
  format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsneg8F(vecY dst, vecY src, rRegI scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (AbsVF  src));
  match(Set dst (NegVF  src));
  effect(TEMP scratch);
  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed8F" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    int vector_len = 1;
    __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabsneg16F(vecZ dst, vecZ src, rRegI scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (AbsVF  src));
  match(Set dst (NegVF  src));
  effect(TEMP scratch);
  format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed16F" %}
  ins_cost(150);
  ins_encode %{
    int opcode = this->as_Mach()->ideal_Opcode();
    int vector_len = 2;
    __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- FMA --------------------------------------

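// Each pattern below computes c = a * b + c with a single rounding. The
// addend c is also the destination, which lets the vfmad/vfmaf macro
// assembler routines use the three-operand FMA3 (vfmadd231-style) encodings.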
// a * b + c
instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

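// The _mem variants fold the LoadVector of the second multiplicand into the
// FMA instruction's memory operand.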
// a * b + c
instruct vfma2D_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}


// a * b + c
instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4D_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVD  c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma4F_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 4);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma8F_mem(vecY a, memory b, vecY c) %{
  predicate(UseFMA && n->as_Vector()->length() == 8);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 1;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a b)));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
  predicate(UseFMA && n->as_Vector()->length() == 16);
  match(Set c (FmaVF  c (Binary a (LoadVector b))));
  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 2;
    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- PopCount --------------------------------------

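// vpopcntd counts the set bits in each 32-bit lane. It belongs to the
// AVX512_VPOPCNTDQ extension, hence the supports_vpopcntdq() check in every
// predicate in addition to UsePopCountInstruction.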
instruct vpopcount2I(vecD dst, vecD src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount4I(vecX dst, vecX src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount8I(vecY dst, vecY src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vpopcount16I(vecZ dst, vecZ src) %{
  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}