1 //
   2 // Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
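     //
     // As a concrete reading of this format (using the XMM0 entry defined below),
     //
     //   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
     //
     // declares XMM0 as Save-On-Call for both the allocator and the C calling
     // convention, spilled as a float (Op_RegF), with opcode encoding 0, and
     // backed by the VM-level handle xmm0->as_VMReg().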
  61 
  62 // XMM registers.  512-bit registers, i.e. 16 32-bit words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The full-width registers are used by the SSE4.2 intrinsics, the array copy
  65 // stubs and superword operations (see the UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No registers are preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
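     //
     // Note: whether the upper bank XMM16-XMM31 can actually be allocated is
     // decided at run time; the reg_class_dynamic definitions further down
     // select between the EVEX and legacy register classes using predicates
     // such as VM_Version::supports_evex().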
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
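     // The reg_class_dynamic entries above resolve to the EVEX class (XMM0-XMM31)
     // when the predicate holds, and to the legacy class (XMM0-XMM15) otherwise.
     // An operand elsewhere in the AD files binds to such a class roughly like
     // this (illustrative sketch only, not a definition made in this file):
     //
     //   operand regF() %{
     //     constraint(ALLOC_IN_RC(float_reg));
     //     match(RegF);
     //     format %{ %}
     //     interface(REG_INTER);
     //   %}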
 733 
 734 // Class for pre evex double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for evex double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre evex 32bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for evex 32bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for pre evex 64bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for evex 64bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for pre evex 128bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for evex 128bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for an unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
  // push "the_pc" on the stack without destroying any registers,
  // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1376   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1377   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1378 
1379 //=============================================================================
1380 const bool Matcher::match_rule_supported(int opcode) {
1381   if (!has_match_rule(opcode))
1382     return false;
1383 
1384   bool ret_value = true;
1385   switch (opcode) {
1386     case Op_AbsVL:
1387       if (UseAVX < 3)
        ret_value = false;
      break;
1389     case Op_PopCountI:
1390     case Op_PopCountL:
1391       if (!UsePopCountInstruction)
1392         ret_value = false;
1393       break;
1394     case Op_PopCountVI:
1395       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1396         ret_value = false;
1397       break;
1398     case Op_MulVI:
1399       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1400         ret_value = false;
1401       break;
1402     case Op_MulVL:
1403     case Op_MulReductionVL:
1404       if (VM_Version::supports_avx512dq() == false)
1405         ret_value = false;
1406       break;
1407     case Op_AddReductionVL:
      if (UseAVX < 3) // EVEX only: vector connectivity becomes an issue here
1409         ret_value = false;
1410       break;
1411     case Op_AbsVB:
1412     case Op_AbsVS:
1413     case Op_AbsVI:
1414     case Op_AddReductionVI:
1415       if (UseSSE < 3) // requires at least SSE3
1416         ret_value = false;
1417       break;
1418     case Op_MulReductionVI:
1419       if (UseSSE < 4) // requires at least SSE4
1420         ret_value = false;
1421       break;
1422     case Op_AddReductionVF:
1423     case Op_AddReductionVD:
1424     case Op_MulReductionVF:
1425     case Op_MulReductionVD:
1426       if (UseSSE < 1) // requires at least SSE
1427         ret_value = false;
1428       break;
1429     case Op_SqrtVD:
1430     case Op_SqrtVF:
1431       if (UseAVX < 1) // enabled for AVX only
1432         ret_value = false;
1433       break;
1434     case Op_CompareAndSwapL:
1435 #ifdef _LP64
1436     case Op_CompareAndSwapP:
1437 #endif
1438       if (!VM_Version::supports_cx8())
1439         ret_value = false;
1440       break;
1441     case Op_CMoveVF:
1442     case Op_CMoveVD:
1443       if (UseAVX < 1 || UseAVX > 2)
1444         ret_value = false;
1445       break;
1446     case Op_StrIndexOf:
1447       if (!UseSSE42Intrinsics)
1448         ret_value = false;
1449       break;
1450     case Op_StrIndexOfChar:
1451       if (!UseSSE42Intrinsics)
1452         ret_value = false;
1453       break;
1454     case Op_OnSpinWait:
1455       if (VM_Version::supports_on_spin_wait() == false)
1456         ret_value = false;
1457       break;
1458     case Op_MulAddVS2VI:
1459     case Op_RShiftVL:
1460     case Op_AbsVD:
1461     case Op_NegVD:
1462       if (UseSSE < 2)
1463         ret_value = false;
1464       break;
1465     case Op_MulVB:
1466     case Op_LShiftVB:
1467     case Op_RShiftVB:
1468     case Op_URShiftVB:
1469       if (UseSSE < 4)
1470         ret_value = false;
1471       break;
1472 #ifdef _LP64
1473     case Op_MaxD:
1474     case Op_MaxF:
1475     case Op_MinD:
1476     case Op_MinF:
1477       if (UseAVX < 1) // enabled for AVX only
1478         ret_value = false;
1479       break;
1480 #endif
1481   }
1482 
  return ret_value;  // By default, match rules are supported.
1484 }
1485 
1486 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_ vector nodes and other intrinsics, while guarding with vlen.
1489   bool ret_value = match_rule_supported(opcode);
1490   if (ret_value) {
1491     switch (opcode) {
1492       case Op_AbsVB:
1493       case Op_AddVB:
1494       case Op_SubVB:
1495         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1496           ret_value = false;
1497         break;
1498       case Op_AbsVS:
1499       case Op_AddVS:
1500       case Op_SubVS:
1501       case Op_MulVS:
1502       case Op_LShiftVS:
1503       case Op_RShiftVS:
1504       case Op_URShiftVS:
1505         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1506           ret_value = false;
1507         break;
1508       case Op_MulVB:
1509       case Op_LShiftVB:
1510       case Op_RShiftVB:
1511       case Op_URShiftVB:
1512         if ((vlen == 32 && UseAVX < 2) || 
1513             ((vlen == 64) && (VM_Version::supports_avx512bw() == false)))
1514           ret_value = false;
1515         break;
1516       case Op_NegVF:
1517         if ((vlen == 16) && (VM_Version::supports_avx512dq() == false))
1518           ret_value = false;
1519         break;
1520       case Op_CMoveVF:
1521         if (vlen != 8)
1522           ret_value  = false;
1523         break;
1524       case Op_NegVD:
1525         if ((vlen == 8) && (VM_Version::supports_avx512dq() == false))
1526           ret_value = false;
1527         break;
1528       case Op_CMoveVD:
1529         if (vlen != 4)
1530           ret_value  = false;
1531         break;
1532     }
1533   }
1534 
  return ret_value;  // By default, match rules are supported.
1536 }
1537 
1538 const bool Matcher::has_predicated_vectors(void) {
1539   bool ret_value = false;
1540   if (UseAVX > 2) {
1541     ret_value = VM_Version::supports_avx512vl();
1542   }
1543 
1544   return ret_value;
1545 }
1546 
1547 const int Matcher::float_pressure(int default_pressure_threshold) {
1548   int float_pressure_threshold = default_pressure_threshold;
1549 #ifdef _LP64
1550   if (UseAVX > 2) {
1551     // Increase pressure threshold on machines with AVX3 which have
1552     // 2x more XMM registers.
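    // (With EVEX encodings the allocator can use XMM0-XMM31, vs. XMM0-XMM15
    //  for the legacy register classes above.)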
1553     float_pressure_threshold = default_pressure_threshold * 2;
1554   }
1555 #endif
1556   return float_pressure_threshold;
1557 }
1558 
1559 // Max vector size in bytes. 0 if not supported.
1560 const int Matcher::vector_width_in_bytes(BasicType bt) {
1561   assert(is_java_primitive(bt), "only primitive type vectors");
1562   if (UseSSE < 2) return 0;
1563   // SSE2 supports 128bit vectors for all types.
1564   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
1566   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
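  // e.g. UseAVX == 2 gives (1 << 2) * 8 = 32 bytes and UseAVX == 3 gives 64 bytes;
  // UseAVX <= 1 starts at 16 bytes. The checks below adjust this per element type.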
1567   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1568   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1569     size = (UseAVX > 2) ? 64 : 32;
1570   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1571     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1572   // Use flag to limit vector size.
1573   size = MIN2(size,(int)MaxVectorSize);
1574   // Minimum 2 values in vector (or 4 for bytes).
1575   switch (bt) {
1576   case T_DOUBLE:
1577   case T_LONG:
1578     if (size < 16) return 0;
1579     break;
1580   case T_FLOAT:
1581   case T_INT:
1582     if (size < 8) return 0;
1583     break;
1584   case T_BOOLEAN:
1585     if (size < 4) return 0;
1586     break;
1587   case T_CHAR:
1588     if (size < 4) return 0;
1589     break;
1590   case T_BYTE:
1591     if (size < 4) return 0;
1592     break;
1593   case T_SHORT:
1594     if (size < 4) return 0;
1595     break;
1596   default:
1597     ShouldNotReachHere();
1598   }
1599   return size;
1600 }
1601 
1602 // Limits on vector size (number of elements) loaded into vector.
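// e.g. with 64-byte (512-bit) vectors, the limit is 16 for T_INT/T_FLOAT and 64 for T_BYTE.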
1603 const int Matcher::max_vector_size(const BasicType bt) {
1604   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1605 }
1606 const int Matcher::min_vector_size(const BasicType bt) {
1607   int max_size = max_vector_size(bt);
1608   // Min size which can be loaded into vector is 4 bytes.
1609   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
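  // e.g. T_BYTE vectors must hold at least 4 elements, all other types at least 2,
  // in both cases capped by max_vector_size for the type.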
1610   return MIN2(size,max_size);
1611 }
1612 
1613 // Vector ideal reg corresponding to specified size in bytes
1614 const uint Matcher::vector_ideal_reg(int size) {
1615   assert(MaxVectorSize >= size, "");
1616   switch(size) {
1617     case  4: return Op_VecS;
1618     case  8: return Op_VecD;
1619     case 16: return Op_VecX;
1620     case 32: return Op_VecY;
1621     case 64: return Op_VecZ;
1622   }
1623   ShouldNotReachHere();
1624   return 0;
1625 }
1626 
1627 // Only lowest bits of xmm reg are used for vector shift count.
1628 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1629   return Op_VecS;
1630 }
1631 
1632 // x86 supports misaligned vectors store/load.
1633 const bool Matcher::misaligned_vectors_ok() {
1634   return true;
1635 }
1636 
1637 // x86 AES instructions are compatible with SunJCE expanded
1638 // keys, hence we do not need to pass the original key to stubs
1639 const bool Matcher::pass_original_key_for_aes() {
1640   return false;
1641 }
1642 
1643 
1644 const bool Matcher::convi2l_type_required = true;
1645 
1646 // Check for shift by small constant as well
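// A left shift by 0..3 corresponds to the 1/2/4/8 scale factors available in
// x86 scaled-index addressing modes, so such a shift can fold into the address.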
1647 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1648   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1649       shift->in(2)->get_int() <= 3 &&
1650       // Are there other uses besides address expressions?
1651       !matcher->is_visited(shift)) {
1652     address_visited.set(shift->_idx); // Flag as address_visited
1653     mstack.push(shift->in(2), Matcher::Visit);
1654     Node *conv = shift->in(1);
1655 #ifdef _LP64
1656     // Allow Matcher to match the rule which bypass
1657     // ConvI2L operation for an array index on LP64
1658     // if the index value is positive.
1659     if (conv->Opcode() == Op_ConvI2L &&
1660         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1661         // Are there other uses besides address expressions?
1662         !matcher->is_visited(conv)) {
1663       address_visited.set(conv->_idx); // Flag as address_visited
1664       mstack.push(conv->in(1), Matcher::Pre_Visit);
1665     } else
1666 #endif
1667       mstack.push(conv, Matcher::Pre_Visit);
1668     return true;
1669   }
1670   return false;
1671 }
1672 
1673 // Should the Matcher clone shifts on addressing modes, expecting them
1674 // to be subsumed into complex addressing expressions or compute them
1675 // into registers?
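// e.g. (base + (index << 2)) + disp can be folded into a single
// [base + index*4 + disp] memory operand.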
1676 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1677   Node *off = m->in(AddPNode::Offset);
1678   if (off->is_Con()) {
1679     address_visited.test_set(m->_idx); // Flag as address_visited
1680     Node *adr = m->in(AddPNode::Address);
1681 
1682     // Intel can handle 2 adds in addressing mode
1683     // AtomicAdd is not an addressing expression.
1684     // Cheap to find it by looking for screwy base.
1685     if (adr->is_AddP() &&
1686         !adr->in(AddPNode::Base)->is_top() &&
1687         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1688         // Are there other uses besides address expressions?
1689         !is_visited(adr)) {
1690       address_visited.set(adr->_idx); // Flag as address_visited
1691       Node *shift = adr->in(AddPNode::Offset);
1692       if (!clone_shift(shift, this, mstack, address_visited)) {
1693         mstack.push(shift, Pre_Visit);
1694       }
1695       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1696       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1697     } else {
1698       mstack.push(adr, Pre_Visit);
1699     }
1700 
1701     // Clone X+offset as it also folds into most addressing expressions
1702     mstack.push(off, Visit);
1703     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1704     return true;
1705   } else if (clone_shift(off, this, mstack, address_visited)) {
1706     address_visited.test_set(m->_idx); // Flag as address_visited
1707     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1708     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1709     return true;
1710   }
1711   return false;
1712 }
1713 
1714 void Compile::reshape_address(AddPNode* addp) {
1715 }
1716 
1717 // Helper methods for MachSpillCopyNode::implementation().
1718 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1719                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1722   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1723   assert(ireg == Op_VecS || // 32bit vector
1724          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1725          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1726          "no non-adjacent vector moves" );
1727   if (cbuf) {
1728     MacroAssembler _masm(cbuf);
1729     int offset = __ offset();
1730     switch (ireg) {
1731     case Op_VecS: // copy whole register
1732     case Op_VecD:
1733     case Op_VecX:
1734 #ifndef _LP64
1735       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1736 #else
1737       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1738         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1739       } else {
1740         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1741      }
1742 #endif
1743       break;
1744     case Op_VecY:
1745 #ifndef _LP64
1746       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1747 #else
1748       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1749         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1750       } else {
1751         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1752      }
1753 #endif
1754       break;
1755     case Op_VecZ:
1756       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1757       break;
1758     default:
1759       ShouldNotReachHere();
1760     }
1761     int size = __ offset() - offset;
1762 #ifdef ASSERT
1763     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1765 #endif
1766     return size;
1767 #ifndef PRODUCT
1768   } else if (!do_size) {
1769     switch (ireg) {
1770     case Op_VecS:
1771     case Op_VecD:
1772     case Op_VecX:
1773       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1774       break;
1775     case Op_VecY:
1776     case Op_VecZ:
1777       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1778       break;
1779     default:
1780       ShouldNotReachHere();
1781     }
1782 #endif
1783   }
1784   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1785   return (UseAVX > 2) ? 6 : 4;
1786 }
1787 
1788 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1789                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1792   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1793   if (cbuf) {
1794     MacroAssembler _masm(cbuf);
1795     int offset = __ offset();
1796     if (is_load) {
1797       switch (ireg) {
1798       case Op_VecS:
1799         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1800         break;
1801       case Op_VecD:
1802         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1803         break;
1804       case Op_VecX:
1805 #ifndef _LP64
1806         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1807 #else
1808         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1809           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1810         } else {
1811           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1812           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1813         }
1814 #endif
1815         break;
1816       case Op_VecY:
1817 #ifndef _LP64
1818         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1819 #else
1820         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1821           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1822         } else {
1823           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1824           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1825         }
1826 #endif
1827         break;
1828       case Op_VecZ:
1829         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1830         break;
1831       default:
1832         ShouldNotReachHere();
1833       }
1834     } else { // store
1835       switch (ireg) {
1836       case Op_VecS:
1837         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1838         break;
1839       case Op_VecD:
1840         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1841         break;
1842       case Op_VecX:
1843 #ifndef _LP64
1844         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1845 #else
1846         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1847           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1848         }
1849         else {
1850           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1851         }
1852 #endif
1853         break;
1854       case Op_VecY:
1855 #ifndef _LP64
1856         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1857 #else
1858         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1859           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1860         }
1861         else {
1862           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1863         }
1864 #endif
1865         break;
1866       case Op_VecZ:
1867         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1868         break;
1869       default:
1870         ShouldNotReachHere();
1871       }
1872     }
1873     int size = __ offset() - offset;
1874 #ifdef ASSERT
1875     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1876     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1878 #endif
1879     return size;
1880 #ifndef PRODUCT
1881   } else if (!do_size) {
1882     if (is_load) {
1883       switch (ireg) {
1884       case Op_VecS:
1885         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1886         break;
1887       case Op_VecD:
1888         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1889         break;
1890        case Op_VecX:
1891         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1892         break;
1893       case Op_VecY:
1894       case Op_VecZ:
1895         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1896         break;
1897       default:
1898         ShouldNotReachHere();
1899       }
1900     } else { // store
1901       switch (ireg) {
1902       case Op_VecS:
1903         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1904         break;
1905       case Op_VecD:
1906         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1907         break;
1908        case Op_VecX:
1909         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1910         break;
1911       case Op_VecY:
1912       case Op_VecZ:
1913         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1914         break;
1915       default:
1916         ShouldNotReachHere();
1917       }
1918     }
1919 #endif
1920   }
1921   bool is_single_byte = false;
1922   int vec_len = 0;
1923   if ((UseAVX > 2) && (stack_offset != 0)) {
1924     int tuple_type = Assembler::EVEX_FVM;
1925     int input_size = Assembler::EVEX_32bit;
1926     switch (ireg) {
1927     case Op_VecS:
1928       tuple_type = Assembler::EVEX_T1S;
1929       break;
1930     case Op_VecD:
1931       tuple_type = Assembler::EVEX_T1S;
1932       input_size = Assembler::EVEX_64bit;
1933       break;
1934     case Op_VecX:
1935       break;
1936     case Op_VecY:
1937       vec_len = 1;
1938       break;
1939     case Op_VecZ:
1940       vec_len = 2;
1941       break;
1942     }
1943     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1944   }
1945   int offset_size = 0;
1946   int size = 5;
1947   if (UseAVX > 2 ) {
1948     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1949       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1950       size += 2; // Need an additional two bytes for EVEX encoding
1951     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1952       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1953     } else {
1954       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1956     }
1957   } else {
1958     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1959   }
1960   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1961   return size+offset_size;
1962 }
1963 
1964 static inline jint replicate4_imm(int con, int width) {
1965   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1966   assert(width == 1 || width == 2, "only byte or short types here");
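  // e.g. replicate4_imm(0xAB, 1) -> 0xABABABAB, replicate4_imm(0x1234, 2) -> 0x12341234.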
1967   int bit_width = width * 8;
1968   jint val = con;
1969   val &= (1 << bit_width) - 1;  // mask off sign bits
1970   while(bit_width < 32) {
1971     val |= (val << bit_width);
1972     bit_width <<= 1;
1973   }
1974   return val;
1975 }
1976 
1977 static inline jlong replicate8_imm(int con, int width) {
1978   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1979   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
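  // e.g. replicate8_imm(0xAB, 1) -> 0xABABABABABABABAB.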
1980   int bit_width = width * 8;
1981   jlong val = con;
1982   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1983   while(bit_width < 64) {
1984     val |= (val << bit_width);
1985     bit_width <<= 1;
1986   }
1987   return val;
1988 }
1989 
1990 #ifndef PRODUCT
1991   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1992     st->print("nop \t# %d bytes pad for loops and calls", _count);
1993   }
1994 #endif
1995 
1996   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1997     MacroAssembler _masm(&cbuf);
1998     __ nop(_count);
1999   }
2000 
2001   uint MachNopNode::size(PhaseRegAlloc*) const {
2002     return _count;
2003   }
2004 
2005 #ifndef PRODUCT
2006   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2007     st->print("# breakpoint");
2008   }
2009 #endif
2010 
2011   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2012     MacroAssembler _masm(&cbuf);
2013     __ int3();
2014   }
2015 
2016   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2017     return MachNode::size(ra_);
2018   }
2019 
2020 %}
2021 
2022 encode %{
2023 
2024   enc_class call_epilog %{
2025     if (VerifyStackAtCalls) {
2026       // Check that stack depth is unchanged: find majik cookie on stack
2027       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2028       MacroAssembler _masm(&cbuf);
2029       Label L;
2030       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2031       __ jccb(Assembler::equal, L);
2032       // Die if stack mismatch
2033       __ int3();
2034       __ bind(L);
2035     }
2036   %}
2037 
2038 %}
2039 
2040 
2041 //----------OPERANDS-----------------------------------------------------------
2042 // Operand definitions must precede instruction definitions for correct parsing
2043 // in the ADLC because operands constitute user defined types which are used in
2044 // instruction definitions.
2045 
2046 operand vecZ() %{
2047   constraint(ALLOC_IN_RC(vectorz_reg));
2048   match(VecZ);
2049 
2050   format %{ %}
2051   interface(REG_INTER);
2052 %}
2053 
2054 operand legVecZ() %{
2055   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2056   match(VecZ);
2057 
2058   format %{ %}
2059   interface(REG_INTER);
2060 %}
2061 
2062 // Comparison Code for FP conditional move
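// The hex values below are the AVX vcmppd comparison-predicate immediates
// (0x0 EQ_OQ, 0x1 LT_OS, 0x2 LE_OS, 0xC NEQ_OQ, 0xD GE_OS, 0xE GT_OS).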
2063 operand cmpOp_vcmppd() %{
2064   match(Bool);
2065 
2066   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2067             n->as_Bool()->_test._test != BoolTest::no_overflow);
2068   format %{ "" %}
2069   interface(COND_INTER) %{
2070     equal        (0x0, "eq");
2071     less         (0x1, "lt");
2072     less_equal   (0x2, "le");
2073     not_equal    (0xC, "ne");
2074     greater_equal(0xD, "ge");
2075     greater      (0xE, "gt");
2076     //TODO cannot compile (adlc breaks) without two next lines with error:
2077     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2078     // equal' for overflow.
2079     overflow     (0x20, "o");  // not really supported by the instruction
2080     no_overflow  (0x21, "no"); // not really supported by the instruction
2081   %}
2082 %}
2083 
2084 
2085 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2086 
2087 // ============================================================================
2088 
2089 instruct ShouldNotReachHere() %{
2090   match(Halt);
2091   format %{ "ud2\t# ShouldNotReachHere" %}
2092   ins_encode %{
2093     __ ud2();
2094   %}
2095   ins_pipe(pipe_slow);
2096 %}
2097 
2098 // =================================EVEX special===============================
2099 
2100 instruct setMask(rRegI dst, rRegI src) %{
2101   predicate(Matcher::has_predicated_vectors());
2102   match(Set dst (SetVectMaskI  src));
2103   effect(TEMP dst);
2104   format %{ "setvectmask   $dst, $src" %}
2105   ins_encode %{
2106     __ setvectmask($dst$$Register, $src$$Register);
2107   %}
2108   ins_pipe(pipe_slow);
2109 %}
2110 
2111 // ============================================================================
2112 
2113 instruct addF_reg(regF dst, regF src) %{
2114   predicate((UseSSE>=1) && (UseAVX == 0));
2115   match(Set dst (AddF dst src));
2116 
2117   format %{ "addss   $dst, $src" %}
2118   ins_cost(150);
2119   ins_encode %{
2120     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2121   %}
2122   ins_pipe(pipe_slow);
2123 %}
2124 
2125 instruct addF_mem(regF dst, memory src) %{
2126   predicate((UseSSE>=1) && (UseAVX == 0));
2127   match(Set dst (AddF dst (LoadF src)));
2128 
2129   format %{ "addss   $dst, $src" %}
2130   ins_cost(150);
2131   ins_encode %{
2132     __ addss($dst$$XMMRegister, $src$$Address);
2133   %}
2134   ins_pipe(pipe_slow);
2135 %}
2136 
2137 instruct addF_imm(regF dst, immF con) %{
2138   predicate((UseSSE>=1) && (UseAVX == 0));
2139   match(Set dst (AddF dst con));
2140   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2141   ins_cost(150);
2142   ins_encode %{
2143     __ addss($dst$$XMMRegister, $constantaddress($con));
2144   %}
2145   ins_pipe(pipe_slow);
2146 %}
2147 
2148 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2149   predicate(UseAVX > 0);
2150   match(Set dst (AddF src1 src2));
2151 
2152   format %{ "vaddss  $dst, $src1, $src2" %}
2153   ins_cost(150);
2154   ins_encode %{
2155     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2156   %}
2157   ins_pipe(pipe_slow);
2158 %}
2159 
2160 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2161   predicate(UseAVX > 0);
2162   match(Set dst (AddF src1 (LoadF src2)));
2163 
2164   format %{ "vaddss  $dst, $src1, $src2" %}
2165   ins_cost(150);
2166   ins_encode %{
2167     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2168   %}
2169   ins_pipe(pipe_slow);
2170 %}
2171 
2172 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2173   predicate(UseAVX > 0);
2174   match(Set dst (AddF src con));
2175 
2176   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2177   ins_cost(150);
2178   ins_encode %{
2179     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2180   %}
2181   ins_pipe(pipe_slow);
2182 %}
2183 
2184 instruct addD_reg(regD dst, regD src) %{
2185   predicate((UseSSE>=2) && (UseAVX == 0));
2186   match(Set dst (AddD dst src));
2187 
2188   format %{ "addsd   $dst, $src" %}
2189   ins_cost(150);
2190   ins_encode %{
2191     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2192   %}
2193   ins_pipe(pipe_slow);
2194 %}
2195 
2196 instruct addD_mem(regD dst, memory src) %{
2197   predicate((UseSSE>=2) && (UseAVX == 0));
2198   match(Set dst (AddD dst (LoadD src)));
2199 
2200   format %{ "addsd   $dst, $src" %}
2201   ins_cost(150);
2202   ins_encode %{
2203     __ addsd($dst$$XMMRegister, $src$$Address);
2204   %}
2205   ins_pipe(pipe_slow);
2206 %}
2207 
2208 instruct addD_imm(regD dst, immD con) %{
2209   predicate((UseSSE>=2) && (UseAVX == 0));
2210   match(Set dst (AddD dst con));
2211   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2212   ins_cost(150);
2213   ins_encode %{
2214     __ addsd($dst$$XMMRegister, $constantaddress($con));
2215   %}
2216   ins_pipe(pipe_slow);
2217 %}
2218 
2219 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2220   predicate(UseAVX > 0);
2221   match(Set dst (AddD src1 src2));
2222 
2223   format %{ "vaddsd  $dst, $src1, $src2" %}
2224   ins_cost(150);
2225   ins_encode %{
2226     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2227   %}
2228   ins_pipe(pipe_slow);
2229 %}
2230 
2231 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2232   predicate(UseAVX > 0);
2233   match(Set dst (AddD src1 (LoadD src2)));
2234 
2235   format %{ "vaddsd  $dst, $src1, $src2" %}
2236   ins_cost(150);
2237   ins_encode %{
2238     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2239   %}
2240   ins_pipe(pipe_slow);
2241 %}
2242 
2243 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2244   predicate(UseAVX > 0);
2245   match(Set dst (AddD src con));
2246 
2247   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2248   ins_cost(150);
2249   ins_encode %{
2250     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2251   %}
2252   ins_pipe(pipe_slow);
2253 %}
2254 
2255 instruct subF_reg(regF dst, regF src) %{
2256   predicate((UseSSE>=1) && (UseAVX == 0));
2257   match(Set dst (SubF dst src));
2258 
2259   format %{ "subss   $dst, $src" %}
2260   ins_cost(150);
2261   ins_encode %{
2262     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2263   %}
2264   ins_pipe(pipe_slow);
2265 %}
2266 
2267 instruct subF_mem(regF dst, memory src) %{
2268   predicate((UseSSE>=1) && (UseAVX == 0));
2269   match(Set dst (SubF dst (LoadF src)));
2270 
2271   format %{ "subss   $dst, $src" %}
2272   ins_cost(150);
2273   ins_encode %{
2274     __ subss($dst$$XMMRegister, $src$$Address);
2275   %}
2276   ins_pipe(pipe_slow);
2277 %}
2278 
2279 instruct subF_imm(regF dst, immF con) %{
2280   predicate((UseSSE>=1) && (UseAVX == 0));
2281   match(Set dst (SubF dst con));
2282   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2283   ins_cost(150);
2284   ins_encode %{
2285     __ subss($dst$$XMMRegister, $constantaddress($con));
2286   %}
2287   ins_pipe(pipe_slow);
2288 %}
2289 
2290 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2291   predicate(UseAVX > 0);
2292   match(Set dst (SubF src1 src2));
2293 
2294   format %{ "vsubss  $dst, $src1, $src2" %}
2295   ins_cost(150);
2296   ins_encode %{
2297     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2298   %}
2299   ins_pipe(pipe_slow);
2300 %}
2301 
2302 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2303   predicate(UseAVX > 0);
2304   match(Set dst (SubF src1 (LoadF src2)));
2305 
2306   format %{ "vsubss  $dst, $src1, $src2" %}
2307   ins_cost(150);
2308   ins_encode %{
2309     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2310   %}
2311   ins_pipe(pipe_slow);
2312 %}
2313 
2314 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2315   predicate(UseAVX > 0);
2316   match(Set dst (SubF src con));
2317 
2318   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2319   ins_cost(150);
2320   ins_encode %{
2321     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2322   %}
2323   ins_pipe(pipe_slow);
2324 %}
2325 
2326 instruct subD_reg(regD dst, regD src) %{
2327   predicate((UseSSE>=2) && (UseAVX == 0));
2328   match(Set dst (SubD dst src));
2329 
2330   format %{ "subsd   $dst, $src" %}
2331   ins_cost(150);
2332   ins_encode %{
2333     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2334   %}
2335   ins_pipe(pipe_slow);
2336 %}
2337 
2338 instruct subD_mem(regD dst, memory src) %{
2339   predicate((UseSSE>=2) && (UseAVX == 0));
2340   match(Set dst (SubD dst (LoadD src)));
2341 
2342   format %{ "subsd   $dst, $src" %}
2343   ins_cost(150);
2344   ins_encode %{
2345     __ subsd($dst$$XMMRegister, $src$$Address);
2346   %}
2347   ins_pipe(pipe_slow);
2348 %}
2349 
2350 instruct subD_imm(regD dst, immD con) %{
2351   predicate((UseSSE>=2) && (UseAVX == 0));
2352   match(Set dst (SubD dst con));
2353   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2354   ins_cost(150);
2355   ins_encode %{
2356     __ subsd($dst$$XMMRegister, $constantaddress($con));
2357   %}
2358   ins_pipe(pipe_slow);
2359 %}
2360 
2361 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2362   predicate(UseAVX > 0);
2363   match(Set dst (SubD src1 src2));
2364 
2365   format %{ "vsubsd  $dst, $src1, $src2" %}
2366   ins_cost(150);
2367   ins_encode %{
2368     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2369   %}
2370   ins_pipe(pipe_slow);
2371 %}
2372 
2373 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2374   predicate(UseAVX > 0);
2375   match(Set dst (SubD src1 (LoadD src2)));
2376 
2377   format %{ "vsubsd  $dst, $src1, $src2" %}
2378   ins_cost(150);
2379   ins_encode %{
2380     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2381   %}
2382   ins_pipe(pipe_slow);
2383 %}
2384 
2385 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2386   predicate(UseAVX > 0);
2387   match(Set dst (SubD src con));
2388 
2389   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2390   ins_cost(150);
2391   ins_encode %{
2392     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2393   %}
2394   ins_pipe(pipe_slow);
2395 %}
2396 
2397 instruct mulF_reg(regF dst, regF src) %{
2398   predicate((UseSSE>=1) && (UseAVX == 0));
2399   match(Set dst (MulF dst src));
2400 
2401   format %{ "mulss   $dst, $src" %}
2402   ins_cost(150);
2403   ins_encode %{
2404     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2405   %}
2406   ins_pipe(pipe_slow);
2407 %}
2408 
2409 instruct mulF_mem(regF dst, memory src) %{
2410   predicate((UseSSE>=1) && (UseAVX == 0));
2411   match(Set dst (MulF dst (LoadF src)));
2412 
2413   format %{ "mulss   $dst, $src" %}
2414   ins_cost(150);
2415   ins_encode %{
2416     __ mulss($dst$$XMMRegister, $src$$Address);
2417   %}
2418   ins_pipe(pipe_slow);
2419 %}
2420 
2421 instruct mulF_imm(regF dst, immF con) %{
2422   predicate((UseSSE>=1) && (UseAVX == 0));
2423   match(Set dst (MulF dst con));
2424   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2425   ins_cost(150);
2426   ins_encode %{
2427     __ mulss($dst$$XMMRegister, $constantaddress($con));
2428   %}
2429   ins_pipe(pipe_slow);
2430 %}
2431 
2432 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2433   predicate(UseAVX > 0);
2434   match(Set dst (MulF src1 src2));
2435 
2436   format %{ "vmulss  $dst, $src1, $src2" %}
2437   ins_cost(150);
2438   ins_encode %{
2439     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2440   %}
2441   ins_pipe(pipe_slow);
2442 %}
2443 
2444 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2445   predicate(UseAVX > 0);
2446   match(Set dst (MulF src1 (LoadF src2)));
2447 
2448   format %{ "vmulss  $dst, $src1, $src2" %}
2449   ins_cost(150);
2450   ins_encode %{
2451     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2452   %}
2453   ins_pipe(pipe_slow);
2454 %}
2455 
2456 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2457   predicate(UseAVX > 0);
2458   match(Set dst (MulF src con));
2459 
2460   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2461   ins_cost(150);
2462   ins_encode %{
2463     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2464   %}
2465   ins_pipe(pipe_slow);
2466 %}
2467 
2468 instruct mulD_reg(regD dst, regD src) %{
2469   predicate((UseSSE>=2) && (UseAVX == 0));
2470   match(Set dst (MulD dst src));
2471 
2472   format %{ "mulsd   $dst, $src" %}
2473   ins_cost(150);
2474   ins_encode %{
2475     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2476   %}
2477   ins_pipe(pipe_slow);
2478 %}
2479 
2480 instruct mulD_mem(regD dst, memory src) %{
2481   predicate((UseSSE>=2) && (UseAVX == 0));
2482   match(Set dst (MulD dst (LoadD src)));
2483 
2484   format %{ "mulsd   $dst, $src" %}
2485   ins_cost(150);
2486   ins_encode %{
2487     __ mulsd($dst$$XMMRegister, $src$$Address);
2488   %}
2489   ins_pipe(pipe_slow);
2490 %}
2491 
2492 instruct mulD_imm(regD dst, immD con) %{
2493   predicate((UseSSE>=2) && (UseAVX == 0));
2494   match(Set dst (MulD dst con));
2495   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2496   ins_cost(150);
2497   ins_encode %{
2498     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2499   %}
2500   ins_pipe(pipe_slow);
2501 %}
2502 
2503 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2504   predicate(UseAVX > 0);
2505   match(Set dst (MulD src1 src2));
2506 
2507   format %{ "vmulsd  $dst, $src1, $src2" %}
2508   ins_cost(150);
2509   ins_encode %{
2510     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2511   %}
2512   ins_pipe(pipe_slow);
2513 %}
2514 
2515 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2516   predicate(UseAVX > 0);
2517   match(Set dst (MulD src1 (LoadD src2)));
2518 
2519   format %{ "vmulsd  $dst, $src1, $src2" %}
2520   ins_cost(150);
2521   ins_encode %{
2522     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2523   %}
2524   ins_pipe(pipe_slow);
2525 %}
2526 
2527 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2528   predicate(UseAVX > 0);
2529   match(Set dst (MulD src con));
2530 
2531   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2532   ins_cost(150);
2533   ins_encode %{
2534     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2535   %}
2536   ins_pipe(pipe_slow);
2537 %}
2538 
2539 instruct divF_reg(regF dst, regF src) %{
2540   predicate((UseSSE>=1) && (UseAVX == 0));
2541   match(Set dst (DivF dst src));
2542 
2543   format %{ "divss   $dst, $src" %}
2544   ins_cost(150);
2545   ins_encode %{
2546     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2547   %}
2548   ins_pipe(pipe_slow);
2549 %}
2550 
2551 instruct divF_mem(regF dst, memory src) %{
2552   predicate((UseSSE>=1) && (UseAVX == 0));
2553   match(Set dst (DivF dst (LoadF src)));
2554 
2555   format %{ "divss   $dst, $src" %}
2556   ins_cost(150);
2557   ins_encode %{
2558     __ divss($dst$$XMMRegister, $src$$Address);
2559   %}
2560   ins_pipe(pipe_slow);
2561 %}
2562 
2563 instruct divF_imm(regF dst, immF con) %{
2564   predicate((UseSSE>=1) && (UseAVX == 0));
2565   match(Set dst (DivF dst con));
2566   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2567   ins_cost(150);
2568   ins_encode %{
2569     __ divss($dst$$XMMRegister, $constantaddress($con));
2570   %}
2571   ins_pipe(pipe_slow);
2572 %}
2573 
2574 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2575   predicate(UseAVX > 0);
2576   match(Set dst (DivF src1 src2));
2577 
2578   format %{ "vdivss  $dst, $src1, $src2" %}
2579   ins_cost(150);
2580   ins_encode %{
2581     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2582   %}
2583   ins_pipe(pipe_slow);
2584 %}
2585 
2586 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2587   predicate(UseAVX > 0);
2588   match(Set dst (DivF src1 (LoadF src2)));
2589 
2590   format %{ "vdivss  $dst, $src1, $src2" %}
2591   ins_cost(150);
2592   ins_encode %{
2593     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2594   %}
2595   ins_pipe(pipe_slow);
2596 %}
2597 
2598 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2599   predicate(UseAVX > 0);
2600   match(Set dst (DivF src con));
2601 
2602   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2603   ins_cost(150);
2604   ins_encode %{
2605     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2606   %}
2607   ins_pipe(pipe_slow);
2608 %}
2609 
2610 instruct divD_reg(regD dst, regD src) %{
2611   predicate((UseSSE>=2) && (UseAVX == 0));
2612   match(Set dst (DivD dst src));
2613 
2614   format %{ "divsd   $dst, $src" %}
2615   ins_cost(150);
2616   ins_encode %{
2617     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2618   %}
2619   ins_pipe(pipe_slow);
2620 %}
2621 
2622 instruct divD_mem(regD dst, memory src) %{
2623   predicate((UseSSE>=2) && (UseAVX == 0));
2624   match(Set dst (DivD dst (LoadD src)));
2625 
2626   format %{ "divsd   $dst, $src" %}
2627   ins_cost(150);
2628   ins_encode %{
2629     __ divsd($dst$$XMMRegister, $src$$Address);
2630   %}
2631   ins_pipe(pipe_slow);
2632 %}
2633 
2634 instruct divD_imm(regD dst, immD con) %{
2635   predicate((UseSSE>=2) && (UseAVX == 0));
2636   match(Set dst (DivD dst con));
2637   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2638   ins_cost(150);
2639   ins_encode %{
2640     __ divsd($dst$$XMMRegister, $constantaddress($con));
2641   %}
2642   ins_pipe(pipe_slow);
2643 %}
2644 
2645 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2646   predicate(UseAVX > 0);
2647   match(Set dst (DivD src1 src2));
2648 
2649   format %{ "vdivsd  $dst, $src1, $src2" %}
2650   ins_cost(150);
2651   ins_encode %{
2652     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2653   %}
2654   ins_pipe(pipe_slow);
2655 %}
2656 
2657 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2658   predicate(UseAVX > 0);
2659   match(Set dst (DivD src1 (LoadD src2)));
2660 
2661   format %{ "vdivsd  $dst, $src1, $src2" %}
2662   ins_cost(150);
2663   ins_encode %{
2664     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2665   %}
2666   ins_pipe(pipe_slow);
2667 %}
2668 
2669 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2670   predicate(UseAVX > 0);
2671   match(Set dst (DivD src con));
2672 
2673   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2674   ins_cost(150);
2675   ins_encode %{
2676     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2677   %}
2678   ins_pipe(pipe_slow);
2679 %}
2680 
2681 instruct absF_reg(regF dst) %{
2682   predicate((UseSSE>=1) && (UseAVX == 0));
2683   match(Set dst (AbsF dst));
2684   ins_cost(150);
2685   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2686   ins_encode %{
2687     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2688   %}
2689   ins_pipe(pipe_slow);
2690 %}
2691 
2692 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2693   predicate(UseAVX > 0);
2694   match(Set dst (AbsF src));
2695   ins_cost(150);
2696   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2697   ins_encode %{
2698     int vector_len = 0;
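         // vector_len 0 selects the 128-bit form (Assembler::AVX_128bit); only the low float lane is significant here.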
2699     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2700               ExternalAddress(float_signmask()), vector_len);
2701   %}
2702   ins_pipe(pipe_slow);
2703 %}
2704 
2705 instruct absD_reg(regD dst) %{
2706   predicate((UseSSE>=2) && (UseAVX == 0));
2707   match(Set dst (AbsD dst));
2708   ins_cost(150);
2709   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2710             "# abs double by sign masking" %}
2711   ins_encode %{
2712     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2713   %}
2714   ins_pipe(pipe_slow);
2715 %}
2716 
2717 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2718   predicate(UseAVX > 0);
2719   match(Set dst (AbsD src));
2720   ins_cost(150);
2721   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2722             "# abs double by sign masking" %}
2723   ins_encode %{
2724     int vector_len = 0;
2725     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2726               ExternalAddress(double_signmask()), vector_len);
2727   %}
2728   ins_pipe(pipe_slow);
2729 %}
2730 
2731 instruct negF_reg(regF dst) %{
2732   predicate((UseSSE>=1) && (UseAVX == 0));
2733   match(Set dst (NegF dst));
2734   ins_cost(150);
2735   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2736   ins_encode %{
2737     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2738   %}
2739   ins_pipe(pipe_slow);
2740 %}
2741 
2742 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2743   predicate(UseAVX > 0);
2744   match(Set dst (NegF src));
2745   ins_cost(150);
2746   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2747   ins_encode %{
2748     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2749                  ExternalAddress(float_signflip()));
2750   %}
2751   ins_pipe(pipe_slow);
2752 %}
2753 
2754 instruct negD_reg(regD dst) %{
2755   predicate((UseSSE>=2) && (UseAVX == 0));
2756   match(Set dst (NegD dst));
2757   ins_cost(150);
2758   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2759             "# neg double by sign flipping" %}
2760   ins_encode %{
2761     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2762   %}
2763   ins_pipe(pipe_slow);
2764 %}
2765 
2766 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2767   predicate(UseAVX > 0);
2768   match(Set dst (NegD src));
2769   ins_cost(150);
2770   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2771             "# neg double by sign flipping" %}
2772   ins_encode %{
2773     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2774                  ExternalAddress(double_signflip()));
2775   %}
2776   ins_pipe(pipe_slow);
2777 %}
2778 
2779 instruct sqrtF_reg(regF dst, regF src) %{
2780   predicate(UseSSE>=1);
2781   match(Set dst (SqrtF src));
2782 
2783   format %{ "sqrtss  $dst, $src" %}
2784   ins_cost(150);
2785   ins_encode %{
2786     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2787   %}
2788   ins_pipe(pipe_slow);
2789 %}
2790 
2791 instruct sqrtF_mem(regF dst, memory src) %{
2792   predicate(UseSSE>=1);
2793   match(Set dst (SqrtF (LoadF src)));
2794 
2795   format %{ "sqrtss  $dst, $src" %}
2796   ins_cost(150);
2797   ins_encode %{
2798     __ sqrtss($dst$$XMMRegister, $src$$Address);
2799   %}
2800   ins_pipe(pipe_slow);
2801 %}
2802 
2803 instruct sqrtF_imm(regF dst, immF con) %{
2804   predicate(UseSSE>=1);
2805   match(Set dst (SqrtF con));
2806 
2807   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2808   ins_cost(150);
2809   ins_encode %{
2810     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2811   %}
2812   ins_pipe(pipe_slow);
2813 %}
2814 
2815 instruct sqrtD_reg(regD dst, regD src) %{
2816   predicate(UseSSE>=2);
2817   match(Set dst (SqrtD src));
2818 
2819   format %{ "sqrtsd  $dst, $src" %}
2820   ins_cost(150);
2821   ins_encode %{
2822     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2823   %}
2824   ins_pipe(pipe_slow);
2825 %}
2826 
2827 instruct sqrtD_mem(regD dst, memory src) %{
2828   predicate(UseSSE>=2);
2829   match(Set dst (SqrtD (LoadD src)));
2830 
2831   format %{ "sqrtsd  $dst, $src" %}
2832   ins_cost(150);
2833   ins_encode %{
2834     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2835   %}
2836   ins_pipe(pipe_slow);
2837 %}
2838 
2839 instruct sqrtD_imm(regD dst, immD con) %{
2840   predicate(UseSSE>=2);
2841   match(Set dst (SqrtD con));
2842   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2843   ins_cost(150);
2844   ins_encode %{
2845     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2846   %}
2847   ins_pipe(pipe_slow);
2848 %}
2849 
2850 instruct onspinwait() %{
2851   match(OnSpinWait);
2852   ins_cost(200);
2853 
2854   format %{
2855     $$template
2856     $$emit$$"pause\t! membar_onspinwait"
2857   %}
2858   ins_encode %{
2859     __ pause();
2860   %}
2861   ins_pipe(pipe_slow);
2862 %}
2863 
2864 // a * b + c
2865 instruct fmaD_reg(regD a, regD b, regD c) %{
2866   predicate(UseFMA);
2867   match(Set c (FmaD  c (Binary a b)));
2868   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2869   ins_cost(150);
2870   ins_encode %{
2871     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2872   %}
2873   ins_pipe( pipe_slow );
2874 %}
2875 
2876 // a * b + c
2877 instruct fmaF_reg(regF a, regF b, regF c) %{
2878   predicate(UseFMA);
2879   match(Set c (FmaF  c (Binary a b)));
2880   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2881   ins_cost(150);
2882   ins_encode %{
2883     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2884   %}
2885   ins_pipe( pipe_slow );
2886 %}
2887 
2888 // ====================VECTOR INSTRUCTIONS=====================================
2889 
2890 
2891 // Load vectors (4 bytes long)
2892 instruct loadV4(vecS dst, memory mem) %{
2893   predicate(n->as_LoadVector()->memory_size() == 4);
2894   match(Set dst (LoadVector mem));
2895   ins_cost(125);
2896   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2897   ins_encode %{
2898     __ movdl($dst$$XMMRegister, $mem$$Address);
2899   %}
2900   ins_pipe( pipe_slow );
2901 %}
2902 
2903 // Move vectors (4 bytes long)
2904 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2905   match(Set dst src);
2906   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
2907   ins_encode %{
2908     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2909   %}
2910   ins_pipe( fpu_reg_reg );
2911 %}
2912 
2913 // Move vectors (4 bytes long)
2914 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2915   match(Set dst src);
2916   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
2917   ins_encode %{
2918     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2919   %}
2920   ins_pipe( fpu_reg_reg );
2921 %}
2922 
2923 // Load vectors (8 bytes long)
2924 instruct loadV8(vecD dst, memory mem) %{
2925   predicate(n->as_LoadVector()->memory_size() == 8);
2926   match(Set dst (LoadVector mem));
2927   ins_cost(125);
2928   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2929   ins_encode %{
2930     __ movq($dst$$XMMRegister, $mem$$Address);
2931   %}
2932   ins_pipe( pipe_slow );
2933 %}
2934 
2935 // Move vectors (8 bytes long)
2936 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
2937   match(Set dst src);
2938   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
2939   ins_encode %{
2940     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2941   %}
2942   ins_pipe( fpu_reg_reg );
2943 %}
2944 
2945 // Move vectors (8 bytes long)
2946 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
2947   match(Set dst src);
2948   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
2949   ins_encode %{
2950     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2951   %}
2952   ins_pipe( fpu_reg_reg );
2953 %}
2954 
2955 // Load vectors (16 bytes long)
2956 instruct loadV16(vecX dst, memory mem) %{
2957   predicate(n->as_LoadVector()->memory_size() == 16);
2958   match(Set dst (LoadVector mem));
2959   ins_cost(125);
2960   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2961   ins_encode %{
2962     __ movdqu($dst$$XMMRegister, $mem$$Address);
2963   %}
2964   ins_pipe( pipe_slow );
2965 %}
2966 
2967 // Move vectors (16 bytes long)
2968 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
2969   match(Set dst src);
2970   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
2971   ins_encode %{
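         // The source or destination may live in XMM16-XMM31, which only EVEX encodings can address;
         // without AVX-512VL the only EVEX form available is the 512-bit evmovdquq, so it is used instead
         // of the VEX-encoded movdqu below.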
2972     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2973       int vector_len = 2;
2974       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2975     } else {
2976       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2977     }
2978   %}
2979   ins_pipe( fpu_reg_reg );
2980 %}
2981 
2982 // Move vectors (16 bytes long)
2983 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
2984   match(Set dst src);
2985   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
2986   ins_encode %{
2987     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2988       int vector_len = 2;
2989       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2990     } else {
2991       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2992     }
2993   %}
2994   ins_pipe( fpu_reg_reg );
2995 %}
2996 
2997 // Load vectors (32 bytes long)
2998 instruct loadV32(vecY dst, memory mem) %{
2999   predicate(n->as_LoadVector()->memory_size() == 32);
3000   match(Set dst (LoadVector mem));
3001   ins_cost(125);
3002   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3003   ins_encode %{
3004     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3005   %}
3006   ins_pipe( pipe_slow );
3007 %}
3008 
3009 // Move vectors (32 bytes long)
3010 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3011   match(Set dst src);
3012   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3013   ins_encode %{
3014     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3015       int vector_len = 2;
3016       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3017     } else {
3018       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3019     }
3020   %}
3021   ins_pipe( fpu_reg_reg );
3022 %}
3023 
3024 // Move vectors (32 bytes long)
3025 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3026   match(Set dst src);
3027   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3028   ins_encode %{
3029     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3030       int vector_len = 2;
3031       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3032     } else {
3033       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3034     }
3035   %}
3036   ins_pipe( fpu_reg_reg );
3037 %}
3038 
3039 // Load vectors (64 bytes long)
3040 instruct loadV64_dword(vecZ dst, memory mem) %{
3041   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3042   match(Set dst (LoadVector mem));
3043   ins_cost(125);
3044   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3045   ins_encode %{
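         // vector_len 2 selects the 512-bit EVEX form (Assembler::AVX_512bit).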
3046     int vector_len = 2;
3047     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3048   %}
3049   ins_pipe( pipe_slow );
3050 %}
3051 
3052 // Load vectors (64 bytes long)
3053 instruct loadV64_qword(vecZ dst, memory mem) %{
3054   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3055   match(Set dst (LoadVector mem));
3056   ins_cost(125);
3057   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3058   ins_encode %{
3059     int vector_len = 2;
3060     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3061   %}
3062   ins_pipe( pipe_slow );
3063 %}
3064 
3065 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3066   match(Set dst src);
3067   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3068   ins_encode %{
3069     int vector_len = 2;
3070     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3071   %}
3072   ins_pipe( fpu_reg_reg );
3073 %}
3074 
3075 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3076   match(Set dst src);
3077   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3078   ins_encode %{
3079     int vector_len = 2;
3080     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3081   %}
3082   ins_pipe( fpu_reg_reg );
3083 %}
3084 
3085 // Store vectors
3086 instruct storeV4(memory mem, vecS src) %{
3087   predicate(n->as_StoreVector()->memory_size() == 4);
3088   match(Set mem (StoreVector mem src));
3089   ins_cost(145);
3090   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3091   ins_encode %{
3092     __ movdl($mem$$Address, $src$$XMMRegister);
3093   %}
3094   ins_pipe( pipe_slow );
3095 %}
3096 
3097 instruct storeV8(memory mem, vecD src) %{
3098   predicate(n->as_StoreVector()->memory_size() == 8);
3099   match(Set mem (StoreVector mem src));
3100   ins_cost(145);
3101   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3102   ins_encode %{
3103     __ movq($mem$$Address, $src$$XMMRegister);
3104   %}
3105   ins_pipe( pipe_slow );
3106 %}
3107 
3108 instruct storeV16(memory mem, vecX src) %{
3109   predicate(n->as_StoreVector()->memory_size() == 16);
3110   match(Set mem (StoreVector mem src));
3111   ins_cost(145);
3112   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3113   ins_encode %{
3114     __ movdqu($mem$$Address, $src$$XMMRegister);
3115   %}
3116   ins_pipe( pipe_slow );
3117 %}
3118 
3119 instruct storeV32(memory mem, vecY src) %{
3120   predicate(n->as_StoreVector()->memory_size() == 32);
3121   match(Set mem (StoreVector mem src));
3122   ins_cost(145);
3123   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3124   ins_encode %{
3125     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3126   %}
3127   ins_pipe( pipe_slow );
3128 %}
3129 
3130 instruct storeV64_dword(memory mem, vecZ src) %{
3131   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3132   match(Set mem (StoreVector mem src));
3133   ins_cost(145);
3134   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3135   ins_encode %{
3136     int vector_len = 2;
3137     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3138   %}
3139   ins_pipe( pipe_slow );
3140 %}
3141 
3142 instruct storeV64_qword(memory mem, vecZ src) %{
3143   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3144   match(Set mem (StoreVector mem src));
3145   ins_cost(145);
3146   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3147   ins_encode %{
3148     int vector_len = 2;
3149     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3150   %}
3151   ins_pipe( pipe_slow );
3152 %}
3153 
3154 // ====================LEGACY REPLICATE=======================================
3155 
3156 instruct Repl16B(vecX dst, rRegI src) %{
3157   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3158   match(Set dst (ReplicateB src));
3159   format %{ "movd    $dst,$src\n\t"
3160             "punpcklbw $dst,$dst\n\t"
3161             "pshuflw $dst,$dst,0x00\n\t"
3162             "punpcklqdq $dst,$dst\t! replicate16B" %}
3163   ins_encode %{
3164     __ movdl($dst$$XMMRegister, $src$$Register);
3165     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3166     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3167     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3168   %}
3169   ins_pipe( pipe_slow );
3170 %}
3171 
3172 instruct Repl32B(vecY dst, rRegI src) %{
3173   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3174   match(Set dst (ReplicateB src));
3175   format %{ "movd    $dst,$src\n\t"
3176             "punpcklbw $dst,$dst\n\t"
3177             "pshuflw $dst,$dst,0x00\n\t"
3178             "punpcklqdq $dst,$dst\n\t"
3179             "vinserti128_high $dst,$dst\t! replicate32B" %}
3180   ins_encode %{
3181     __ movdl($dst$$XMMRegister, $src$$Register);
3182     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3183     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3184     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3185     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3186   %}
3187   ins_pipe( pipe_slow );
3188 %}
3189 
3190 instruct Repl64B(legVecZ dst, rRegI src) %{
3191   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3192   match(Set dst (ReplicateB src));
3193   format %{ "movd    $dst,$src\n\t"
3194             "punpcklbw $dst,$dst\n\t"
3195             "pshuflw $dst,$dst,0x00\n\t"
3196             "punpcklqdq $dst,$dst\n\t"
3197             "vinserti128_high $dst,$dst\t"
3198             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3199   ins_encode %{
3200     __ movdl($dst$$XMMRegister, $src$$Register);
3201     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3202     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3203     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3204     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3205     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3206   %}
3207   ins_pipe( pipe_slow );
3208 %}
3209 
3210 instruct Repl16B_imm(vecX dst, immI con) %{
3211   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3212   match(Set dst (ReplicateB con));
3213   format %{ "movq    $dst,[$constantaddress]\n\t"
3214             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3215   ins_encode %{
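         // replicate8_imm is assumed to repeat the 1-byte immediate across a 64-bit constant;
         // punpcklqdq then copies that quadword into the upper half to fill all 16 lanes.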
3216     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3217     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3218   %}
3219   ins_pipe( pipe_slow );
3220 %}
3221 
3222 instruct Repl32B_imm(vecY dst, immI con) %{
3223   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3224   match(Set dst (ReplicateB con));
3225   format %{ "movq    $dst,[$constantaddress]\n\t"
3226             "punpcklqdq $dst,$dst\n\t"
3227             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3228   ins_encode %{
3229     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3230     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3231     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3232   %}
3233   ins_pipe( pipe_slow );
3234 %}
3235 
3236 instruct Repl64B_imm(legVecZ dst, immI con) %{
3237   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3238   match(Set dst (ReplicateB con));
3239   format %{ "movq    $dst,[$constantaddress]\n\t"
3240             "punpcklqdq $dst,$dst\n\t"
3241             "vinserti128_high $dst,$dst\t"
3242             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3243   ins_encode %{
3244     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3245     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3246     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3247     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3248   %}
3249   ins_pipe( pipe_slow );
3250 %}
3251 
3252 instruct Repl4S(vecD dst, rRegI src) %{
3253   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3254   match(Set dst (ReplicateS src));
3255   format %{ "movd    $dst,$src\n\t"
3256             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3257   ins_encode %{
3258     __ movdl($dst$$XMMRegister, $src$$Register);
3259     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3260   %}
3261   ins_pipe( pipe_slow );
3262 %}
3263 
3264 instruct Repl4S_mem(vecD dst, memory mem) %{
3265   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3266   match(Set dst (ReplicateS (LoadS mem)));
3267   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3268   ins_encode %{
3269     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3270   %}
3271   ins_pipe( pipe_slow );
3272 %}
3273 
3274 instruct Repl8S(vecX dst, rRegI src) %{
3275   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3276   match(Set dst (ReplicateS src));
3277   format %{ "movd    $dst,$src\n\t"
3278             "pshuflw $dst,$dst,0x00\n\t"
3279             "punpcklqdq $dst,$dst\t! replicate8S" %}
3280   ins_encode %{
3281     __ movdl($dst$$XMMRegister, $src$$Register);
3282     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3283     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3284   %}
3285   ins_pipe( pipe_slow );
3286 %}
3287 
3288 instruct Repl8S_mem(vecX dst, memory mem) %{
3289   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3290   match(Set dst (ReplicateS (LoadS mem)));
3291   format %{ "pshuflw $dst,$mem,0x00\n\t"
3292             "punpcklqdq $dst,$dst\t! replicate8S" %}
3293   ins_encode %{
3294     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3295     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3296   %}
3297   ins_pipe( pipe_slow );
3298 %}
3299 
3300 instruct Repl8S_imm(vecX dst, immI con) %{
3301   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3302   match(Set dst (ReplicateS con));
3303   format %{ "movq    $dst,[$constantaddress]\n\t"
3304             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3305   ins_encode %{
3306     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3307     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3308   %}
3309   ins_pipe( pipe_slow );
3310 %}
3311 
3312 instruct Repl16S(vecY dst, rRegI src) %{
3313   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3314   match(Set dst (ReplicateS src));
3315   format %{ "movd    $dst,$src\n\t"
3316             "pshuflw $dst,$dst,0x00\n\t"
3317             "punpcklqdq $dst,$dst\n\t"
3318             "vinserti128_high $dst,$dst\t! replicate16S" %}
3319   ins_encode %{
3320     __ movdl($dst$$XMMRegister, $src$$Register);
3321     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3322     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3323     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3324   %}
3325   ins_pipe( pipe_slow );
3326 %}
3327 
3328 instruct Repl16S_mem(vecY dst, memory mem) %{
3329   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3330   match(Set dst (ReplicateS (LoadS mem)));
3331   format %{ "pshuflw $dst,$mem,0x00\n\t"
3332             "punpcklqdq $dst,$dst\n\t"
3333             "vinserti128_high $dst,$dst\t! replicate16S" %}
3334   ins_encode %{
3335     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3336     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3337     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3338   %}
3339   ins_pipe( pipe_slow );
3340 %}
3341 
3342 instruct Repl16S_imm(vecY dst, immI con) %{
3343   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3344   match(Set dst (ReplicateS con));
3345   format %{ "movq    $dst,[$constantaddress]\n\t"
3346             "punpcklqdq $dst,$dst\n\t"
3347             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3348   ins_encode %{
3349     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3350     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3351     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct Repl32S(legVecZ dst, rRegI src) %{
3357   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3358   match(Set dst (ReplicateS src));
3359   format %{ "movd    $dst,$src\n\t"
3360             "pshuflw $dst,$dst,0x00\n\t"
3361             "punpcklqdq $dst,$dst\n\t"
3362             "vinserti128_high $dst,$dst\t"
3363             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3364   ins_encode %{
3365     __ movdl($dst$$XMMRegister, $src$$Register);
3366     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3367     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3368     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3369     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3370   %}
3371   ins_pipe( pipe_slow );
3372 %}
3373 
3374 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3375   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3376   match(Set dst (ReplicateS (LoadS mem)));
3377   format %{ "pshuflw $dst,$mem,0x00\n\t"
3378             "punpcklqdq $dst,$dst\n\t"
3379             "vinserti128_high $dst,$dst\t"
3380             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3381   ins_encode %{
3382     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3383     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3384     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3385     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3386   %}
3387   ins_pipe( pipe_slow );
3388 %}
3389 
3390 instruct Repl32S_imm(legVecZ dst, immI con) %{
3391   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3392   match(Set dst (ReplicateS con));
3393   format %{ "movq    $dst,[$constantaddress]\n\t"
3394             "punpcklqdq $dst,$dst\n\t"
3395             "vinserti128_high $dst,$dst\t"
3396             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3397   ins_encode %{
3398     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3399     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3400     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3401     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3402   %}
3403   ins_pipe( pipe_slow );
3404 %}
3405 
3406 instruct Repl4I(vecX dst, rRegI src) %{
3407   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3408   match(Set dst (ReplicateI src));
3409   format %{ "movd    $dst,$src\n\t"
3410             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3411   ins_encode %{
3412     __ movdl($dst$$XMMRegister, $src$$Register);
3413     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3414   %}
3415   ins_pipe( pipe_slow );
3416 %}
3417 
3418 instruct Repl4I_mem(vecX dst, memory mem) %{
3419   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3420   match(Set dst (ReplicateI (LoadI mem)));
3421   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3422   ins_encode %{
3423     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3424   %}
3425   ins_pipe( pipe_slow );
3426 %}
3427 
3428 instruct Repl8I(vecY dst, rRegI src) %{
3429   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3430   match(Set dst (ReplicateI src));
3431   format %{ "movd    $dst,$src\n\t"
3432             "pshufd  $dst,$dst,0x00\n\t"
3433             "vinserti128_high $dst,$dst\t! replicate8I" %}
3434   ins_encode %{
3435     __ movdl($dst$$XMMRegister, $src$$Register);
3436     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3437     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3438   %}
3439   ins_pipe( pipe_slow );
3440 %}
3441 
3442 instruct Repl8I_mem(vecY dst, memory mem) %{
3443   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3444   match(Set dst (ReplicateI (LoadI mem)));
3445   format %{ "pshufd  $dst,$mem,0x00\n\t"
3446             "vinserti128_high $dst,$dst\t! replicate8I" %}
3447   ins_encode %{
3448     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3449     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3450   %}
3451   ins_pipe( pipe_slow );
3452 %}
3453 
3454 instruct Repl16I(legVecZ dst, rRegI src) %{
3455   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3456   match(Set dst (ReplicateI src));
3457   format %{ "movd    $dst,$src\n\t"
3458             "pshufd  $dst,$dst,0x00\n\t"
3459             "vinserti128_high $dst,$dst\t"
3460             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3461   ins_encode %{
3462     __ movdl($dst$$XMMRegister, $src$$Register);
3463     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3464     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3465     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3466   %}
3467   ins_pipe( pipe_slow );
3468 %}
3469 
3470 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3471   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3472   match(Set dst (ReplicateI (LoadI mem)));
3473   format %{ "pshufd  $dst,$mem,0x00\n\t"
3474             "vinserti128_high $dst,$dst\t"
3475             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3476   ins_encode %{
3477     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3478     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3479     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3480   %}
3481   ins_pipe( pipe_slow );
3482 %}
3483 
3484 instruct Repl4I_imm(vecX dst, immI con) %{
3485   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3486   match(Set dst (ReplicateI con));
3487   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3488             "punpcklqdq $dst,$dst" %}
3489   ins_encode %{
3490     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3491     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3492   %}
3493   ins_pipe( pipe_slow );
3494 %}
3495 
3496 instruct Repl8I_imm(vecY dst, immI con) %{
3497   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3498   match(Set dst (ReplicateI con));
3499   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3500             "punpcklqdq $dst,$dst\n\t"
3501             "vinserti128_high $dst,$dst" %}
3502   ins_encode %{
3503     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3504     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3505     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
3510 instruct Repl16I_imm(legVecZ dst, immI con) %{
3511   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3512   match(Set dst (ReplicateI con));
3513   format %{ "movq    $dst,[$constantaddress]\t"
3514             "punpcklqdq $dst,$dst\n\t"
3515             "vinserti128_high $dst,$dst"
3516             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3517   ins_encode %{
3518     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3519     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3520     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3521     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3522   %}
3523   ins_pipe( pipe_slow );
3524 %}
3525 
3526 // A long can be loaded into an XMM register directly from memory.
3527 instruct Repl2L_mem(vecX dst, memory mem) %{
3528   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3529   match(Set dst (ReplicateL (LoadL mem)));
3530   format %{ "movq    $dst,$mem\n\t"
3531             "punpcklqdq $dst,$dst\t! replicate2L" %}
3532   ins_encode %{
3533     __ movq($dst$$XMMRegister, $mem$$Address);
3534     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3535   %}
3536   ins_pipe( pipe_slow );
3537 %}
3538 
3539 // Replicate a long (8-byte) scalar into a vector
3540 #ifdef _LP64
3541 instruct Repl4L(vecY dst, rRegL src) %{
3542   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3543   match(Set dst (ReplicateL src));
3544   format %{ "movdq   $dst,$src\n\t"
3545             "punpcklqdq $dst,$dst\n\t"
3546             "vinserti128_high $dst,$dst\t! replicate4L" %}
3547   ins_encode %{
3548     __ movdq($dst$$XMMRegister, $src$$Register);
3549     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3550     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3551   %}
3552   ins_pipe( pipe_slow );
3553 %}
3554 
3555 instruct Repl8L(legVecZ dst, rRegL src) %{
3556   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3557   match(Set dst (ReplicateL src));
3558   format %{ "movdq   $dst,$src\n\t"
3559             "punpcklqdq $dst,$dst\n\t"
3560             "vinserti128_high $dst,$dst\t"
3561             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3562   ins_encode %{
3563     __ movdq($dst$$XMMRegister, $src$$Register);
3564     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3565     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3566     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3567   %}
3568   ins_pipe( pipe_slow );
3569 %}
3570 #else // _LP64
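     // On 32-bit, a long lives in a register pair, so the low and high halves are moved into
     // XMM separately and recombined with punpckldq before replication.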
3571 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3572   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3573   match(Set dst (ReplicateL src));
3574   effect(TEMP dst, USE src, TEMP tmp);
3575   format %{ "movdl   $dst,$src.lo\n\t"
3576             "movdl   $tmp,$src.hi\n\t"
3577             "punpckldq $dst,$tmp\n\t"
3578             "punpcklqdq $dst,$dst\n\t"
3579             "vinserti128_high $dst,$dst\t! replicate4L" %}
3580   ins_encode %{
3581     __ movdl($dst$$XMMRegister, $src$$Register);
3582     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3583     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3584     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3585     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3586   %}
3587   ins_pipe( pipe_slow );
3588 %}
3589 
3590 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3591   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3592   match(Set dst (ReplicateL src));
3593   effect(TEMP dst, USE src, TEMP tmp);
3594   format %{ "movdl   $dst,$src.lo\n\t"
3595             "movdl   $tmp,$src.hi\n\t"
3596             "punpckldq $dst,$tmp\n\t"
3597             "punpcklqdq $dst,$dst\n\t"
3598             "vinserti128_high $dst,$dst\t"
3599             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3600   ins_encode %{
3601     __ movdl($dst$$XMMRegister, $src$$Register);
3602     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3603     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3604     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3605     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3606     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3607   %}
3608   ins_pipe( pipe_slow );
3609 %}
3610 #endif // _LP64
3611 
3612 instruct Repl4L_imm(vecY dst, immL con) %{
3613   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3614   match(Set dst (ReplicateL con));
3615   format %{ "movq    $dst,[$constantaddress]\n\t"
3616             "punpcklqdq $dst,$dst\n\t"
3617             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3618   ins_encode %{
3619     __ movq($dst$$XMMRegister, $constantaddress($con));
3620     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3621     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3622   %}
3623   ins_pipe( pipe_slow );
3624 %}
3625 
3626 instruct Repl8L_imm(legVecZ dst, immL con) %{
3627   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3628   match(Set dst (ReplicateL con));
3629   format %{ "movq    $dst,[$constantaddress]\n\t"
3630             "punpcklqdq $dst,$dst\n\t"
3631             "vinserti128_high $dst,$dst\t"
3632             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3633   ins_encode %{
3634     __ movq($dst$$XMMRegister, $constantaddress($con));
3635     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3636     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3637     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3638   %}
3639   ins_pipe( pipe_slow );
3640 %}
3641 
3642 instruct Repl4L_mem(vecY dst, memory mem) %{
3643   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3644   match(Set dst (ReplicateL (LoadL mem)));
3645   format %{ "movq    $dst,$mem\n\t"
3646             "punpcklqdq $dst,$dst\n\t"
3647             "vinserti128_high $dst,$dst\t! replicate4L" %}
3648   ins_encode %{
3649     __ movq($dst$$XMMRegister, $mem$$Address);
3650     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3651     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3652   %}
3653   ins_pipe( pipe_slow );
3654 %}
3655 
3656 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3657   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3658   match(Set dst (ReplicateL (LoadL mem)));
3659   format %{ "movq    $dst,$mem\n\t"
3660             "punpcklqdq $dst,$dst\n\t"
3661             "vinserti128_high $dst,$dst\t"
3662             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3663   ins_encode %{
3664     __ movq($dst$$XMMRegister, $mem$$Address);
3665     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3666     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3667     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3668   %}
3669   ins_pipe( pipe_slow );
3670 %}
3671 
3672 instruct Repl2F_mem(vecD dst, memory mem) %{
3673   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3674   match(Set dst (ReplicateF (LoadF mem)));
3675   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3676   ins_encode %{
3677     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3678   %}
3679   ins_pipe( pipe_slow );
3680 %}
3681 
3682 instruct Repl4F_mem(vecX dst, memory mem) %{
3683   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3684   match(Set dst (ReplicateF (LoadF mem)));
3685   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3686   ins_encode %{
3687     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3688   %}
3689   ins_pipe( pipe_slow );
3690 %}
3691 
3692 instruct Repl8F(vecY dst, vlRegF src) %{
3693   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3694   match(Set dst (ReplicateF src));
3695   format %{ "pshufd  $dst,$src,0x00\n\t"
3696             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3697   ins_encode %{
3698     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3699     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3700   %}
3701   ins_pipe( pipe_slow );
3702 %}
3703 
3704 instruct Repl8F_mem(vecY dst, memory mem) %{
3705   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3706   match(Set dst (ReplicateF (LoadF mem)));
3707   format %{ "pshufd  $dst,$mem,0x00\n\t"
3708             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3709   ins_encode %{
3710     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3711     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3712   %}
3713   ins_pipe( pipe_slow );
3714 %}
3715 
3716 instruct Repl16F(legVecZ dst, vlRegF src) %{
3717   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3718   match(Set dst (ReplicateF src));
3719   format %{ "pshufd  $dst,$src,0x00\n\t"
3720             "vinsertf128_high $dst,$dst\t"
3721             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3722   ins_encode %{
3723     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3724     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3725     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3726   %}
3727   ins_pipe( pipe_slow );
3728 %}
3729 
3730 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3731   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3732   match(Set dst (ReplicateF (LoadF mem)));
3733   format %{ "pshufd  $dst,$mem,0x00\n\t"
3734             "vinsertf128_high $dst,$dst\t"
3735             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3736   ins_encode %{
3737     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3738     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3739     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3740   %}
3741   ins_pipe( pipe_slow );
3742 %}
3743 
3744 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3745   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3746   match(Set dst (ReplicateF zero));
3747   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3748   ins_encode %{
3749     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3750   %}
3751   ins_pipe( fpu_reg_reg );
3752 %}
3753 
3754 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3755   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3756   match(Set dst (ReplicateF zero));
3757   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3758   ins_encode %{
3759     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3760   %}
3761   ins_pipe( fpu_reg_reg );
3762 %}
3763 
3764 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3765   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3766   match(Set dst (ReplicateF zero));
3767   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3768   ins_encode %{
3769     int vector_len = 1;
3770     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3771   %}
3772   ins_pipe( fpu_reg_reg );
3773 %}
3774 
3775 instruct Repl2D_mem(vecX dst, memory mem) %{
3776   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3777   match(Set dst (ReplicateD (LoadD mem)));
3778   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3779   ins_encode %{
3780     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3781   %}
3782   ins_pipe( pipe_slow );
3783 %}
3784 
3785 instruct Repl4D(vecY dst, vlRegD src) %{
3786   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3787   match(Set dst (ReplicateD src));
3788   format %{ "pshufd  $dst,$src,0x44\n\t"
3789             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3790   ins_encode %{
3791     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3792     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3793   %}
3794   ins_pipe( pipe_slow );
3795 %}
3796 
3797 instruct Repl4D_mem(vecY dst, memory mem) %{
3798   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3799   match(Set dst (ReplicateD (LoadD mem)));
3800   format %{ "pshufd  $dst,$mem,0x44\n\t"
3801             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3802   ins_encode %{
3803     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3804     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3805   %}
3806   ins_pipe( pipe_slow );
3807 %}
3808 
3809 instruct Repl8D(legVecZ dst, vlRegD src) %{
3810   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3811   match(Set dst (ReplicateD src));
3812   format %{ "pshufd  $dst,$src,0x44\n\t"
3813             "vinsertf128_high $dst,$dst\t"
3814             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3815   ins_encode %{
3816     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3817     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3818     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3819   %}
3820   ins_pipe( pipe_slow );
3821 %}
3822 
3823 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3824   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3825   match(Set dst (ReplicateD (LoadD mem)));
3826   format %{ "pshufd  $dst,$mem,0x44\n\t"
3827             "vinsertf128_high $dst,$dst\t"
3828             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3829   ins_encode %{
3830     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3831     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3832     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3833   %}
3834   ins_pipe( pipe_slow );
3835 %}
3836 
3837 // Replicate double (8 byte) scalar zero to be vector
3838 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3839   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3840   match(Set dst (ReplicateD zero));
3841   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3842   ins_encode %{
3843     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3844   %}
3845   ins_pipe( fpu_reg_reg );
3846 %}
3847 
3848 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3849   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3850   match(Set dst (ReplicateD zero));
3851   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3852   ins_encode %{
3853     int vector_len = 1;
3854     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3855   %}
3856   ins_pipe( fpu_reg_reg );
3857 %}
3858 
3859 // ====================GENERIC REPLICATE==========================================
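//
// These are the baseline forms: the scalar is first moved into the low lanes
// of an XMM register (movdl/movdq/movq) and then broadcast with a shuffle or
// unpack. pshufd with immediate 0x00 copies the low dword to all four dword
// lanes, 0x44 duplicates the low qword, and punpcklbw/punpcklqdq spread a
// byte or qword pattern across the register.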
3860 
3861 // Replicate byte scalar to be vector
3862 instruct Repl4B(vecS dst, rRegI src) %{
3863   predicate(n->as_Vector()->length() == 4);
3864   match(Set dst (ReplicateB src));
3865   format %{ "movd    $dst,$src\n\t"
3866             "punpcklbw $dst,$dst\n\t"
3867             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3868   ins_encode %{
3869     __ movdl($dst$$XMMRegister, $src$$Register);
3870     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3871     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3872   %}
3873   ins_pipe( pipe_slow );
3874 %}
3875 
3876 instruct Repl8B(vecD dst, rRegI src) %{
3877   predicate(n->as_Vector()->length() == 8);
3878   match(Set dst (ReplicateB src));
3879   format %{ "movd    $dst,$src\n\t"
3880             "punpcklbw $dst,$dst\n\t"
3881             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3882   ins_encode %{
3883     __ movdl($dst$$XMMRegister, $src$$Register);
3884     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3885     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3886   %}
3887   ins_pipe( pipe_slow );
3888 %}
3889 
3890 // Replicate byte scalar immediate to be vector by loading from const table.
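// The replicate4_imm/replicate8_imm helpers expand the immediate into a 32-bit
// or 64-bit pattern by repeating it at the requested element width (1, 2 or 4
// bytes); the expanded constant is materialized in the constant table and
// loaded with a single movdl/movq.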
3891 instruct Repl4B_imm(vecS dst, immI con) %{
3892   predicate(n->as_Vector()->length() == 4);
3893   match(Set dst (ReplicateB con));
3894   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3895   ins_encode %{
3896     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3897   %}
3898   ins_pipe( pipe_slow );
3899 %}
3900 
3901 instruct Repl8B_imm(vecD dst, immI con) %{
3902   predicate(n->as_Vector()->length() == 8);
3903   match(Set dst (ReplicateB con));
3904   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3905   ins_encode %{
3906     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3907   %}
3908   ins_pipe( pipe_slow );
3909 %}
3910 
3911 // Replicate byte scalar zero to be vector
3912 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3913   predicate(n->as_Vector()->length() == 4);
3914   match(Set dst (ReplicateB zero));
3915   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3916   ins_encode %{
3917     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3918   %}
3919   ins_pipe( fpu_reg_reg );
3920 %}
3921 
3922 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3923   predicate(n->as_Vector()->length() == 8);
3924   match(Set dst (ReplicateB zero));
3925   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3926   ins_encode %{
3927     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3928   %}
3929   ins_pipe( fpu_reg_reg );
3930 %}
3931 
3932 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3933   predicate(n->as_Vector()->length() == 16);
3934   match(Set dst (ReplicateB zero));
3935   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3936   ins_encode %{
3937     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3938   %}
3939   ins_pipe( fpu_reg_reg );
3940 %}
3941 
3942 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3943   predicate(n->as_Vector()->length() == 32);
3944   match(Set dst (ReplicateB zero));
3945   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3946   ins_encode %{
3947     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3948     int vector_len = 1;
3949     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3950   %}
3951   ins_pipe( fpu_reg_reg );
3952 %}
3953 
3954 // Replicate char/short (2 byte) scalar to be vector
3955 instruct Repl2S(vecS dst, rRegI src) %{
3956   predicate(n->as_Vector()->length() == 2);
3957   match(Set dst (ReplicateS src));
3958   format %{ "movd    $dst,$src\n\t"
3959             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3960   ins_encode %{
3961     __ movdl($dst$$XMMRegister, $src$$Register);
3962     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3963   %}
3964   ins_pipe( fpu_reg_reg );
3965 %}
3966 
3967 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3968 instruct Repl2S_imm(vecS dst, immI con) %{
3969   predicate(n->as_Vector()->length() == 2);
3970   match(Set dst (ReplicateS con));
3971   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3972   ins_encode %{
3973     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3974   %}
3975   ins_pipe( fpu_reg_reg );
3976 %}
3977 
3978 instruct Repl4S_imm(vecD dst, immI con) %{
3979   predicate(n->as_Vector()->length() == 4);
3980   match(Set dst (ReplicateS con));
3981   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3982   ins_encode %{
3983     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3984   %}
3985   ins_pipe( fpu_reg_reg );
3986 %}
3987 
3988 // Replicate char/short (2 byte) scalar zero to be vector
3989 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3990   predicate(n->as_Vector()->length() == 2);
3991   match(Set dst (ReplicateS zero));
3992   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3993   ins_encode %{
3994     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3995   %}
3996   ins_pipe( fpu_reg_reg );
3997 %}
3998 
3999 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4000   predicate(n->as_Vector()->length() == 4);
4001   match(Set dst (ReplicateS zero));
4002   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4003   ins_encode %{
4004     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4005   %}
4006   ins_pipe( fpu_reg_reg );
4007 %}
4008 
4009 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4010   predicate(n->as_Vector()->length() == 8);
4011   match(Set dst (ReplicateS zero));
4012   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4013   ins_encode %{
4014     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4015   %}
4016   ins_pipe( fpu_reg_reg );
4017 %}
4018 
4019 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4020   predicate(n->as_Vector()->length() == 16);
4021   match(Set dst (ReplicateS zero));
4022   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4023   ins_encode %{
4024     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4025     int vector_len = 1;
4026     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4027   %}
4028   ins_pipe( fpu_reg_reg );
4029 %}
4030 
4031 // Replicate integer (4 byte) scalar to be vector
4032 instruct Repl2I(vecD dst, rRegI src) %{
4033   predicate(n->as_Vector()->length() == 2);
4034   match(Set dst (ReplicateI src));
4035   format %{ "movd    $dst,$src\n\t"
4036             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4037   ins_encode %{
4038     __ movdl($dst$$XMMRegister, $src$$Register);
4039     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4040   %}
4041   ins_pipe( fpu_reg_reg );
4042 %}
4043 
// The integer can be loaded into an XMM register directly from memory.
4045 instruct Repl2I_mem(vecD dst, memory mem) %{
4046   predicate(n->as_Vector()->length() == 2);
4047   match(Set dst (ReplicateI (LoadI mem)));
4048   format %{ "movd    $dst,$mem\n\t"
4049             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4050   ins_encode %{
4051     __ movdl($dst$$XMMRegister, $mem$$Address);
4052     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4053   %}
4054   ins_pipe( fpu_reg_reg );
4055 %}
4056 
4057 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4058 instruct Repl2I_imm(vecD dst, immI con) %{
4059   predicate(n->as_Vector()->length() == 2);
4060   match(Set dst (ReplicateI con));
4061   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4062   ins_encode %{
4063     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4064   %}
4065   ins_pipe( fpu_reg_reg );
4066 %}
4067 
4068 // Replicate integer (4 byte) scalar zero to be vector
4069 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4070   predicate(n->as_Vector()->length() == 2);
4071   match(Set dst (ReplicateI zero));
4072   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4073   ins_encode %{
4074     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4075   %}
4076   ins_pipe( fpu_reg_reg );
4077 %}
4078 
4079 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4080   predicate(n->as_Vector()->length() == 4);
4081   match(Set dst (ReplicateI zero));
4082   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4083   ins_encode %{
4084     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4085   %}
4086   ins_pipe( fpu_reg_reg );
4087 %}
4088 
4089 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4090   predicate(n->as_Vector()->length() == 8);
4091   match(Set dst (ReplicateI zero));
4092   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4093   ins_encode %{
4094     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4095     int vector_len = 1;
4096     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4097   %}
4098   ins_pipe( fpu_reg_reg );
4099 %}
4100 
4101 // Replicate long (8 byte) scalar to be vector
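// On LP64 the long fits in one GPR and is moved with movdq; on 32-bit VMs it
// is split across a register pair, so both halves are moved separately
// (movdl + HIGH_FROM_LOW) and combined with punpckldq before the broadcast.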
4102 #ifdef _LP64
4103 instruct Repl2L(vecX dst, rRegL src) %{
4104   predicate(n->as_Vector()->length() == 2);
4105   match(Set dst (ReplicateL src));
4106   format %{ "movdq   $dst,$src\n\t"
4107             "punpcklqdq $dst,$dst\t! replicate2L" %}
4108   ins_encode %{
4109     __ movdq($dst$$XMMRegister, $src$$Register);
4110     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4111   %}
4112   ins_pipe( pipe_slow );
4113 %}
4114 #else // _LP64
4115 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4116   predicate(n->as_Vector()->length() == 2);
4117   match(Set dst (ReplicateL src));
4118   effect(TEMP dst, USE src, TEMP tmp);
4119   format %{ "movdl   $dst,$src.lo\n\t"
4120             "movdl   $tmp,$src.hi\n\t"
4121             "punpckldq $dst,$tmp\n\t"
4122             "punpcklqdq $dst,$dst\t! replicate2L"%}
4123   ins_encode %{
4124     __ movdl($dst$$XMMRegister, $src$$Register);
4125     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4126     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4127     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4128   %}
4129   ins_pipe( pipe_slow );
4130 %}
4131 #endif // _LP64
4132 
4133 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4134 instruct Repl2L_imm(vecX dst, immL con) %{
4135   predicate(n->as_Vector()->length() == 2);
4136   match(Set dst (ReplicateL con));
4137   format %{ "movq    $dst,[$constantaddress]\n\t"
4138             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4139   ins_encode %{
4140     __ movq($dst$$XMMRegister, $constantaddress($con));
4141     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4142   %}
4143   ins_pipe( pipe_slow );
4144 %}
4145 
4146 // Replicate long (8 byte) scalar zero to be vector
4147 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4148   predicate(n->as_Vector()->length() == 2);
4149   match(Set dst (ReplicateL zero));
4150   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4151   ins_encode %{
4152     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4153   %}
4154   ins_pipe( fpu_reg_reg );
4155 %}
4156 
4157 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4158   predicate(n->as_Vector()->length() == 4);
4159   match(Set dst (ReplicateL zero));
4160   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4161   ins_encode %{
4162     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
4163     int vector_len = 1;
4164     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4165   %}
4166   ins_pipe( fpu_reg_reg );
4167 %}
4168 
4169 // Replicate float (4 byte) scalar to be vector
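// Floats already live in XMM registers (vlRegF), so no GPR-to-XMM move is
// needed; a single pshufd 0x00 broadcasts the low dword.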
4170 instruct Repl2F(vecD dst, vlRegF src) %{
4171   predicate(n->as_Vector()->length() == 2);
4172   match(Set dst (ReplicateF src));
4173   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4174   ins_encode %{
4175     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4176   %}
4177   ins_pipe( fpu_reg_reg );
4178 %}
4179 
4180 instruct Repl4F(vecX dst, vlRegF src) %{
4181   predicate(n->as_Vector()->length() == 4);
4182   match(Set dst (ReplicateF src));
4183   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4184   ins_encode %{
4185     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4186   %}
4187   ins_pipe( pipe_slow );
4188 %}
4189 
4190 // Replicate double (8 bytes) scalar to be vector
4191 instruct Repl2D(vecX dst, vlRegD src) %{
4192   predicate(n->as_Vector()->length() == 2);
4193   match(Set dst (ReplicateD src));
4194   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4195   ins_encode %{
4196     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4197   %}
4198   ins_pipe( pipe_slow );
4199 %}
4200 
4201 // ====================EVEX REPLICATE=============================================
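//
// With AVX-512 the whole broadcast collapses into one vpbroadcast*/evpbroadcast*
// instruction. The vector_len argument selects the encoded width: 0 = 128-bit,
// 1 = 256-bit, 2 = 512-bit. The 128- and 256-bit EVEX forms additionally need
// AVX512VL (plus AVX512BW for byte/word elements), which is what the
// supports_avx512vl()/supports_avx512vlbw() predicates guard.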
4202 
4203 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4204   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4205   match(Set dst (ReplicateB (LoadB mem)));
4206   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4207   ins_encode %{
4208     int vector_len = 0;
4209     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4210   %}
4211   ins_pipe( pipe_slow );
4212 %}
4213 
4214 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4215   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4216   match(Set dst (ReplicateB (LoadB mem)));
4217   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4218   ins_encode %{
4219     int vector_len = 0;
4220     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4221   %}
4222   ins_pipe( pipe_slow );
4223 %}
4224 
4225 instruct Repl16B_evex(vecX dst, rRegI src) %{
4226   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4227   match(Set dst (ReplicateB src));
4228   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4229   ins_encode %{
    int vector_len = 0;
4231     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4232   %}
4233   ins_pipe( pipe_slow );
4234 %}
4235 
4236 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4237   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4238   match(Set dst (ReplicateB (LoadB mem)));
4239   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4240   ins_encode %{
4241     int vector_len = 0;
4242     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4243   %}
4244   ins_pipe( pipe_slow );
4245 %}
4246 
4247 instruct Repl32B_evex(vecY dst, rRegI src) %{
4248   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4249   match(Set dst (ReplicateB src));
4250   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4251   ins_encode %{
    int vector_len = 1;
4253     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4254   %}
4255   ins_pipe( pipe_slow );
4256 %}
4257 
4258 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4259   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4260   match(Set dst (ReplicateB (LoadB mem)));
4261   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4262   ins_encode %{
4263     int vector_len = 1;
4264     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4265   %}
4266   ins_pipe( pipe_slow );
4267 %}
4268 
4269 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4270   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4271   match(Set dst (ReplicateB src));
4272   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4273   ins_encode %{
    int vector_len = 2;
4275     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4276   %}
4277   ins_pipe( pipe_slow );
4278 %}
4279 
4280 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4281   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4282   match(Set dst (ReplicateB (LoadB mem)));
4283   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4284   ins_encode %{
4285     int vector_len = 2;
4286     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4287   %}
4288   ins_pipe( pipe_slow );
4289 %}
4290 
4291 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4292   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4293   match(Set dst (ReplicateB con));
4294   format %{ "movq    $dst,[$constantaddress]\n\t"
4295             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4296   ins_encode %{
    int vector_len = 0;
4298     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4299     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4300   %}
4301   ins_pipe( pipe_slow );
4302 %}
4303 
4304 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4305   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4306   match(Set dst (ReplicateB con));
4307   format %{ "movq    $dst,[$constantaddress]\n\t"
4308             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4309   ins_encode %{
    int vector_len = 1;
4311     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4312     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4313   %}
4314   ins_pipe( pipe_slow );
4315 %}
4316 
4317 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4318   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4319   match(Set dst (ReplicateB con));
4320   format %{ "movq    $dst,[$constantaddress]\n\t"
4321             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4322   ins_encode %{
    int vector_len = 2;
4324     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4325     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4326   %}
4327   ins_pipe( pipe_slow );
4328 %}
4329 
4330 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4331   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4332   match(Set dst (ReplicateB zero));
4333   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4334   ins_encode %{
4335     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4336     int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4338   %}
4339   ins_pipe( fpu_reg_reg );
4340 %}
4341 
4342 instruct Repl4S_evex(vecD dst, rRegI src) %{
4343   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4344   match(Set dst (ReplicateS src));
4345   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4346   ins_encode %{
    int vector_len = 0;
4348     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4349   %}
4350   ins_pipe( pipe_slow );
4351 %}
4352 
4353 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4354   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4355   match(Set dst (ReplicateS (LoadS mem)));
4356   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4357   ins_encode %{
4358     int vector_len = 0;
4359     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4360   %}
4361   ins_pipe( pipe_slow );
4362 %}
4363 
4364 instruct Repl8S_evex(vecX dst, rRegI src) %{
4365   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4366   match(Set dst (ReplicateS src));
4367   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4368   ins_encode %{
    int vector_len = 0;
4370     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4371   %}
4372   ins_pipe( pipe_slow );
4373 %}
4374 
4375 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4376   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4377   match(Set dst (ReplicateS (LoadS mem)));
4378   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4379   ins_encode %{
4380     int vector_len = 0;
4381     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4382   %}
4383   ins_pipe( pipe_slow );
4384 %}
4385 
4386 instruct Repl16S_evex(vecY dst, rRegI src) %{
4387   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4388   match(Set dst (ReplicateS src));
4389   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4390   ins_encode %{
    int vector_len = 1;
4392     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4393   %}
4394   ins_pipe( pipe_slow );
4395 %}
4396 
4397 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4398   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4399   match(Set dst (ReplicateS (LoadS mem)));
4400   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4401   ins_encode %{
4402     int vector_len = 1;
4403     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4404   %}
4405   ins_pipe( pipe_slow );
4406 %}
4407 
4408 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4409   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4410   match(Set dst (ReplicateS src));
4411   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4412   ins_encode %{
    int vector_len = 2;
4414     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4415   %}
4416   ins_pipe( pipe_slow );
4417 %}
4418 
4419 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4420   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4421   match(Set dst (ReplicateS (LoadS mem)));
4422   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4423   ins_encode %{
4424     int vector_len = 2;
4425     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4426   %}
4427   ins_pipe( pipe_slow );
4428 %}
4429 
4430 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4431   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4432   match(Set dst (ReplicateS con));
4433   format %{ "movq    $dst,[$constantaddress]\n\t"
4434             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4435   ins_encode %{
    int vector_len = 0;
4437     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4438     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4439   %}
4440   ins_pipe( pipe_slow );
4441 %}
4442 
4443 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4444   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4445   match(Set dst (ReplicateS con));
4446   format %{ "movq    $dst,[$constantaddress]\n\t"
4447             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4448   ins_encode %{
    int vector_len = 1;
4450     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4451     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4452   %}
4453   ins_pipe( pipe_slow );
4454 %}
4455 
4456 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4457   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4458   match(Set dst (ReplicateS con));
4459   format %{ "movq    $dst,[$constantaddress]\n\t"
4460             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4461   ins_encode %{
    int vector_len = 2;
4463     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4464     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4465   %}
4466   ins_pipe( pipe_slow );
4467 %}
4468 
4469 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4470   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4471   match(Set dst (ReplicateS zero));
4472   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4473   ins_encode %{
4474     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4475     int vector_len = 2;
4476     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4477   %}
4478   ins_pipe( fpu_reg_reg );
4479 %}
4480 
4481 instruct Repl4I_evex(vecX dst, rRegI src) %{
4482   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4483   match(Set dst (ReplicateI src));
4484   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4485   ins_encode %{
4486     int vector_len = 0;
4487     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4488   %}
4489   ins_pipe( pipe_slow );
4490 %}
4491 
4492 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4493   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4494   match(Set dst (ReplicateI (LoadI mem)));
4495   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4496   ins_encode %{
4497     int vector_len = 0;
4498     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4499   %}
4500   ins_pipe( pipe_slow );
4501 %}
4502 
4503 instruct Repl8I_evex(vecY dst, rRegI src) %{
4504   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4505   match(Set dst (ReplicateI src));
4506   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4507   ins_encode %{
4508     int vector_len = 1;
4509     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4510   %}
4511   ins_pipe( pipe_slow );
4512 %}
4513 
4514 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4515   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4516   match(Set dst (ReplicateI (LoadI mem)));
4517   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4518   ins_encode %{
4519     int vector_len = 1;
4520     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4521   %}
4522   ins_pipe( pipe_slow );
4523 %}
4524 
4525 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4526   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4527   match(Set dst (ReplicateI src));
4528   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4529   ins_encode %{
4530     int vector_len = 2;
4531     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4532   %}
4533   ins_pipe( pipe_slow );
4534 %}
4535 
4536 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4537   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4538   match(Set dst (ReplicateI (LoadI mem)));
4539   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4540   ins_encode %{
4541     int vector_len = 2;
4542     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4543   %}
4544   ins_pipe( pipe_slow );
4545 %}
4546 
4547 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4548   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4549   match(Set dst (ReplicateI con));
4550   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4551             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4552   ins_encode %{
4553     int vector_len = 0;
4554     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4555     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4556   %}
4557   ins_pipe( pipe_slow );
4558 %}
4559 
4560 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4561   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4562   match(Set dst (ReplicateI con));
4563   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4564             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4565   ins_encode %{
4566     int vector_len = 1;
4567     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4568     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4569   %}
4570   ins_pipe( pipe_slow );
4571 %}
4572 
4573 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4574   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4575   match(Set dst (ReplicateI con));
4576   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4577             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4578   ins_encode %{
4579     int vector_len = 2;
4580     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4581     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4582   %}
4583   ins_pipe( pipe_slow );
4584 %}
4585 
4586 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4587   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4588   match(Set dst (ReplicateI zero));
4589   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4590   ins_encode %{
    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4592     int vector_len = 2;
4593     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4594   %}
4595   ins_pipe( fpu_reg_reg );
4596 %}
4597 
4598 // Replicate long (8 byte) scalar to be vector
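// evpbroadcastq can broadcast straight from a 64-bit GPR only on LP64; the
// 32-bit build first assembles the long from its register pair into an XMM
// register and then broadcasts it with vpbroadcastq.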
4599 #ifdef _LP64
4600 instruct Repl4L_evex(vecY dst, rRegL src) %{
4601   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4602   match(Set dst (ReplicateL src));
4603   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4604   ins_encode %{
4605     int vector_len = 1;
4606     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4607   %}
4608   ins_pipe( pipe_slow );
4609 %}
4610 
4611 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4612   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4613   match(Set dst (ReplicateL src));
4614   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4615   ins_encode %{
4616     int vector_len = 2;
4617     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4618   %}
4619   ins_pipe( pipe_slow );
4620 %}
4621 #else // _LP64
4622 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4623   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4624   match(Set dst (ReplicateL src));
4625   effect(TEMP dst, USE src, TEMP tmp);
4626   format %{ "movdl   $dst,$src.lo\n\t"
4627             "movdl   $tmp,$src.hi\n\t"
4628             "punpckldq $dst,$tmp\n\t"
4629             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4630   ins_encode %{
4631     int vector_len = 1;
4632     __ movdl($dst$$XMMRegister, $src$$Register);
4633     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4634     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4635     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4636   %}
4637   ins_pipe( pipe_slow );
4638 %}
4639 
4640 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4641   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4642   match(Set dst (ReplicateL src));
4643   effect(TEMP dst, USE src, TEMP tmp);
4644   format %{ "movdl   $dst,$src.lo\n\t"
4645             "movdl   $tmp,$src.hi\n\t"
4646             "punpckldq $dst,$tmp\n\t"
4647             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4648   ins_encode %{
4649     int vector_len = 2;
4650     __ movdl($dst$$XMMRegister, $src$$Register);
4651     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4652     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4653     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4654   %}
4655   ins_pipe( pipe_slow );
4656 %}
4657 #endif // _LP64
4658 
4659 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4660   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4661   match(Set dst (ReplicateL con));
4662   format %{ "movq    $dst,[$constantaddress]\n\t"
4663             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4664   ins_encode %{
4665     int vector_len = 1;
4666     __ movq($dst$$XMMRegister, $constantaddress($con));
4667     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4668   %}
4669   ins_pipe( pipe_slow );
4670 %}
4671 
4672 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4673   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4674   match(Set dst (ReplicateL con));
4675   format %{ "movq    $dst,[$constantaddress]\n\t"
4676             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4677   ins_encode %{
4678     int vector_len = 2;
4679     __ movq($dst$$XMMRegister, $constantaddress($con));
4680     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4681   %}
4682   ins_pipe( pipe_slow );
4683 %}
4684 
4685 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4686   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4687   match(Set dst (ReplicateL (LoadL mem)));
4688   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4689   ins_encode %{
4690     int vector_len = 0;
4691     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4692   %}
4693   ins_pipe( pipe_slow );
4694 %}
4695 
4696 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4697   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4698   match(Set dst (ReplicateL (LoadL mem)));
4699   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4700   ins_encode %{
4701     int vector_len = 1;
4702     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4703   %}
4704   ins_pipe( pipe_slow );
4705 %}
4706 
4707 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4708   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4709   match(Set dst (ReplicateL (LoadL mem)));
4710   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4711   ins_encode %{
4712     int vector_len = 2;
4713     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4714   %}
4715   ins_pipe( pipe_slow );
4716 %}
4717 
4718 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4719   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4720   match(Set dst (ReplicateL zero));
4721   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4722   ins_encode %{
4723     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4724     int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4726   %}
4727   ins_pipe( fpu_reg_reg );
4728 %}
4729 
4730 instruct Repl8F_evex(vecY dst, regF src) %{
4731   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4732   match(Set dst (ReplicateF src));
4733   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4734   ins_encode %{
4735     int vector_len = 1;
4736     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4737   %}
4738   ins_pipe( pipe_slow );
4739 %}
4740 
4741 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4742   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4743   match(Set dst (ReplicateF (LoadF mem)));
4744   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4745   ins_encode %{
4746     int vector_len = 1;
4747     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4748   %}
4749   ins_pipe( pipe_slow );
4750 %}
4751 
4752 instruct Repl16F_evex(vecZ dst, regF src) %{
4753   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4754   match(Set dst (ReplicateF src));
4755   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4756   ins_encode %{
4757     int vector_len = 2;
4758     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4759   %}
4760   ins_pipe( pipe_slow );
4761 %}
4762 
4763 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4764   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4765   match(Set dst (ReplicateF (LoadF mem)));
4766   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4767   ins_encode %{
4768     int vector_len = 2;
4769     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4770   %}
4771   ins_pipe( pipe_slow );
4772 %}
4773 
4774 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4775   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4776   match(Set dst (ReplicateF zero));
4777   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4778   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4782   %}
4783   ins_pipe( fpu_reg_reg );
4784 %}
4785 
4786 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4787   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4788   match(Set dst (ReplicateF zero));
4789   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4790   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4794   %}
4795   ins_pipe( fpu_reg_reg );
4796 %}
4797 
4798 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4799   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4800   match(Set dst (ReplicateF zero));
4801   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4802   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4806   %}
4807   ins_pipe( fpu_reg_reg );
4808 %}
4809 
4810 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4811   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4812   match(Set dst (ReplicateF zero));
4813   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4814   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4818   %}
4819   ins_pipe( fpu_reg_reg );
4820 %}
4821 
4822 instruct Repl4D_evex(vecY dst, regD src) %{
4823   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4824   match(Set dst (ReplicateD src));
4825   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4826   ins_encode %{
4827     int vector_len = 1;
4828     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4829   %}
4830   ins_pipe( pipe_slow );
4831 %}
4832 
4833 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4834   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4835   match(Set dst (ReplicateD (LoadD mem)));
4836   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4837   ins_encode %{
4838     int vector_len = 1;
4839     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4840   %}
4841   ins_pipe( pipe_slow );
4842 %}
4843 
4844 instruct Repl8D_evex(vecZ dst, regD src) %{
4845   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4846   match(Set dst (ReplicateD src));
4847   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4848   ins_encode %{
4849     int vector_len = 2;
4850     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4851   %}
4852   ins_pipe( pipe_slow );
4853 %}
4854 
4855 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4856   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4857   match(Set dst (ReplicateD (LoadD mem)));
4858   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4859   ins_encode %{
4860     int vector_len = 2;
4861     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4862   %}
4863   ins_pipe( pipe_slow );
4864 %}
4865 
4866 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4867   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4868   match(Set dst (ReplicateD zero));
4869   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4870   ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4874   %}
4875   ins_pipe( fpu_reg_reg );
4876 %}
4877 
4878 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4879   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4880   match(Set dst (ReplicateD zero));
4881   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4882   ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4886   %}
4887   ins_pipe( fpu_reg_reg );
4888 %}
4889 
4890 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4891   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4892   match(Set dst (ReplicateD zero));
4893   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4894   ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4898   %}
4899   ins_pipe( fpu_reg_reg );
4900 %}
4901 
4902 // ====================REDUCTION ARITHMETIC=======================================
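//
// Add reductions repeatedly fold the upper half of the vector onto the lower
// half: vextracti64x4_high/vextracti128_high pull down the upper 256/128 bits,
// and pshufd 0xE (upper qword) / 0x1 (second dword) finish the job within an
// XMM register. The last remaining element is added to the scalar src1 and
// moved back to a GPR. The SSE/AVX1-only forms use phaddd/vphaddd horizontal
// adds instead of the shuffle-and-add ladder.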
4903 
4904 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4905   predicate(UseSSE > 2 && UseAVX == 0);
4906   match(Set dst (AddReductionVI src1 src2));
4907   effect(TEMP tmp2, TEMP tmp);
4908   format %{ "movdqu  $tmp2,$src2\n\t"
4909             "phaddd  $tmp2,$tmp2\n\t"
4910             "movd    $tmp,$src1\n\t"
4911             "paddd   $tmp,$tmp2\n\t"
4912             "movd    $dst,$tmp\t! add reduction2I" %}
4913   ins_encode %{
4914     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4915     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4916     __ movdl($tmp$$XMMRegister, $src1$$Register);
4917     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4918     __ movdl($dst$$Register, $tmp$$XMMRegister);
4919   %}
4920   ins_pipe( pipe_slow );
4921 %}
4922 
4923 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4924   predicate(VM_Version::supports_avxonly());
4925   match(Set dst (AddReductionVI src1 src2));
4926   effect(TEMP tmp, TEMP tmp2);
4927   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4928             "movd     $tmp2,$src1\n\t"
4929             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4930             "movd     $dst,$tmp2\t! add reduction2I" %}
4931   ins_encode %{
4932     int vector_len = 0;
4933     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4934     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4935     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4936     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4937   %}
4938   ins_pipe( pipe_slow );
4939 %}
4940 
4941 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4942   predicate(UseAVX > 2);
4943   match(Set dst (AddReductionVI src1 src2));
4944   effect(TEMP tmp, TEMP tmp2);
4945   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4946             "vpaddd  $tmp,$src2,$tmp2\n\t"
4947             "movd    $tmp2,$src1\n\t"
4948             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4949             "movd    $dst,$tmp2\t! add reduction2I" %}
4950   ins_encode %{
4951     int vector_len = 0;
4952     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4953     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4954     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4955     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4956     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4957   %}
4958   ins_pipe( pipe_slow );
4959 %}
4960 
4961 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4962   predicate(UseSSE > 2 && UseAVX == 0);
4963   match(Set dst (AddReductionVI src1 src2));
4964   effect(TEMP tmp, TEMP tmp2);
4965   format %{ "movdqu  $tmp,$src2\n\t"
4966             "phaddd  $tmp,$tmp\n\t"
4967             "phaddd  $tmp,$tmp\n\t"
4968             "movd    $tmp2,$src1\n\t"
4969             "paddd   $tmp2,$tmp\n\t"
4970             "movd    $dst,$tmp2\t! add reduction4I" %}
4971   ins_encode %{
4972     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4973     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4974     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4975     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4976     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4977     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4978   %}
4979   ins_pipe( pipe_slow );
4980 %}
4981 
4982 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4983   predicate(VM_Version::supports_avxonly());
4984   match(Set dst (AddReductionVI src1 src2));
4985   effect(TEMP tmp, TEMP tmp2);
4986   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4987             "vphaddd  $tmp,$tmp,$tmp\n\t"
4988             "movd     $tmp2,$src1\n\t"
4989             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4990             "movd     $dst,$tmp2\t! add reduction4I" %}
4991   ins_encode %{
4992     int vector_len = 0;
4993     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4994     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4995     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4996     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4997     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4998   %}
4999   ins_pipe( pipe_slow );
5000 %}
5001 
5002 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5003   predicate(UseAVX > 2);
5004   match(Set dst (AddReductionVI src1 src2));
5005   effect(TEMP tmp, TEMP tmp2);
5006   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5007             "vpaddd  $tmp,$src2,$tmp2\n\t"
5008             "pshufd  $tmp2,$tmp,0x1\n\t"
5009             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5010             "movd    $tmp2,$src1\n\t"
5011             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5012             "movd    $dst,$tmp2\t! add reduction4I" %}
5013   ins_encode %{
5014     int vector_len = 0;
5015     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5016     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5017     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5018     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5019     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5020     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5021     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5022   %}
5023   ins_pipe( pipe_slow );
5024 %}
5025 
5026 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5027   predicate(VM_Version::supports_avxonly());
5028   match(Set dst (AddReductionVI src1 src2));
5029   effect(TEMP tmp, TEMP tmp2);
5030   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5031             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5032             "vextracti128_high  $tmp2,$tmp\n\t"
5033             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5034             "movd     $tmp2,$src1\n\t"
5035             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5036             "movd     $dst,$tmp2\t! add reduction8I" %}
5037   ins_encode %{
5038     int vector_len = 1;
5039     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5040     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5041     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5042     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5043     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5044     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5045     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5046   %}
5047   ins_pipe( pipe_slow );
5048 %}
5049 
5050 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5051   predicate(UseAVX > 2);
5052   match(Set dst (AddReductionVI src1 src2));
5053   effect(TEMP tmp, TEMP tmp2);
5054   format %{ "vextracti128_high  $tmp,$src2\n\t"
5055             "vpaddd  $tmp,$tmp,$src2\n\t"
5056             "pshufd  $tmp2,$tmp,0xE\n\t"
5057             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5058             "pshufd  $tmp2,$tmp,0x1\n\t"
5059             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5060             "movd    $tmp2,$src1\n\t"
5061             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5062             "movd    $dst,$tmp2\t! add reduction8I" %}
5063   ins_encode %{
5064     int vector_len = 0;
5065     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5066     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5067     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5068     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5069     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5070     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5071     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5072     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5073     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5074   %}
5075   ins_pipe( pipe_slow );
5076 %}
5077 
5078 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5079   predicate(UseAVX > 2);
5080   match(Set dst (AddReductionVI src1 src2));
5081   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5082   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5083             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5084             "vextracti128_high  $tmp,$tmp3\n\t"
5085             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5086             "pshufd  $tmp2,$tmp,0xE\n\t"
5087             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5088             "pshufd  $tmp2,$tmp,0x1\n\t"
5089             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5090             "movd    $tmp2,$src1\n\t"
5091             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5092             "movd    $dst,$tmp2\t! add reduction16I" %}
5093   ins_encode %{
5094     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5095     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5096     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5097     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5098     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5099     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5100     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5101     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5102     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5103     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5104     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5105   %}
5106   ins_pipe( pipe_slow );
5107 %}
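
// Note on the integer add reductions above: each variant folds the vector in
// halves before folding in the scalar input $src1. The upper half is brought
// down either with an extract (vextracti64x4_high / vextracti128_high) or a
// shuffle (pshufd 0xE moves dwords {2,3} into the low half, pshufd 0x1 moves
// dword 1 into lane 0) and added to the lower half until one lane remains.
// A minimal C sketch of the value computed (illustrative helper only, not
// part of this file; lane order is irrelevant for integer addition):
//
//   int add_reduction_vi(int src1, const int* v, int lanes) {
//     int sum = src1;
//     for (int i = 0; i < lanes; i++) {  // lanes = 4, 8 or 16 above
//       sum += v[i];
//     }
//     return sum;
//   }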
5108 
5109 #ifdef _LP64
5110 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5111   predicate(UseAVX > 2);
5112   match(Set dst (AddReductionVL src1 src2));
5113   effect(TEMP tmp, TEMP tmp2);
5114   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5115             "vpaddq  $tmp,$src2,$tmp2\n\t"
5116             "movdq   $tmp2,$src1\n\t"
5117             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5118             "movdq   $dst,$tmp2\t! add reduction2L" %}
5119   ins_encode %{
5120     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5121     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5122     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5123     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5124     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5125   %}
5126   ins_pipe( pipe_slow );
5127 %}
5128 
5129 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5130   predicate(UseAVX > 2);
5131   match(Set dst (AddReductionVL src1 src2));
5132   effect(TEMP tmp, TEMP tmp2);
5133   format %{ "vextracti128_high  $tmp,$src2\n\t"
5134             "vpaddq  $tmp2,$tmp,$src2\n\t"
5135             "pshufd  $tmp,$tmp2,0xE\n\t"
5136             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5137             "movdq   $tmp,$src1\n\t"
5138             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5139             "movdq   $dst,$tmp2\t! add reduction4L" %}
5140   ins_encode %{
5141     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5142     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5143     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5144     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5145     __ movdq($tmp$$XMMRegister, $src1$$Register);
5146     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5147     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5148   %}
5149   ins_pipe( pipe_slow );
5150 %}
5151 
5152 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5153   predicate(UseAVX > 2);
5154   match(Set dst (AddReductionVL src1 src2));
5155   effect(TEMP tmp, TEMP tmp2);
5156   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5157             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5158             "vextracti128_high  $tmp,$tmp2\n\t"
5159             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5160             "pshufd  $tmp,$tmp2,0xE\n\t"
5161             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5162             "movdq   $tmp,$src1\n\t"
5163             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5164             "movdq   $dst,$tmp2\t! add reduction8L" %}
5165   ins_encode %{
5166     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5167     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5168     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5169     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5170     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5171     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5172     __ movdq($tmp$$XMMRegister, $src1$$Register);
5173     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5174     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5175   %}
5176   ins_pipe( pipe_slow );
5177 %}
5178 #endif
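
// The 2L/4L/8L long add reductions above (LP64 only) use the same halving
// scheme with vpaddq; movdq (rather than movdl) moves the 64-bit scalar
// between a general-purpose register and an XMM register.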
5179 
5180 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5181   predicate(UseSSE >= 1 && UseAVX == 0);
5182   match(Set dst (AddReductionVF dst src2));
5183   effect(TEMP dst, TEMP tmp);
5184   format %{ "addss   $dst,$src2\n\t"
5185             "pshufd  $tmp,$src2,0x01\n\t"
5186             "addss   $dst,$tmp\t! add reduction2F" %}
5187   ins_encode %{
5188     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5189     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5190     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5196   predicate(UseAVX > 0);
5197   match(Set dst (AddReductionVF dst src2));
5198   effect(TEMP dst, TEMP tmp);
5199   format %{ "vaddss  $dst,$dst,$src2\n\t"
5200             "pshufd  $tmp,$src2,0x01\n\t"
5201             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5202   ins_encode %{
5203     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5204     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5205     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5206   %}
5207   ins_pipe( pipe_slow );
5208 %}
5209 
5210 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5211   predicate(UseSSE >= 1 && UseAVX == 0);
5212   match(Set dst (AddReductionVF dst src2));
5213   effect(TEMP dst, TEMP tmp);
5214   format %{ "addss   $dst,$src2\n\t"
5215             "pshufd  $tmp,$src2,0x01\n\t"
5216             "addss   $dst,$tmp\n\t"
5217             "pshufd  $tmp,$src2,0x02\n\t"
5218             "addss   $dst,$tmp\n\t"
5219             "pshufd  $tmp,$src2,0x03\n\t"
5220             "addss   $dst,$tmp\t! add reduction4F" %}
5221   ins_encode %{
5222     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5223     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5224     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5225     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5226     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5227     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5228     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5229   %}
5230   ins_pipe( pipe_slow );
5231 %}
5232 
5233 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5234   predicate(UseAVX > 0);
5235   match(Set dst (AddReductionVF dst src2));
5236   effect(TEMP tmp, TEMP dst);
5237   format %{ "vaddss  $dst,$dst,$src2\n\t"
5238             "pshufd  $tmp,$src2,0x01\n\t"
5239             "vaddss  $dst,$dst,$tmp\n\t"
5240             "pshufd  $tmp,$src2,0x02\n\t"
5241             "vaddss  $dst,$dst,$tmp\n\t"
5242             "pshufd  $tmp,$src2,0x03\n\t"
5243             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5244   ins_encode %{
5245     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5246     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5247     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5248     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5249     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5250     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5251     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5252   %}
5253   ins_pipe( pipe_slow );
5254 %}
5255 
5256 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5257   predicate(UseAVX > 0);
5258   match(Set dst (AddReductionVF dst src2));
5259   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5260   format %{ "vaddss  $dst,$dst,$src2\n\t"
5261             "pshufd  $tmp,$src2,0x01\n\t"
5262             "vaddss  $dst,$dst,$tmp\n\t"
5263             "pshufd  $tmp,$src2,0x02\n\t"
5264             "vaddss  $dst,$dst,$tmp\n\t"
5265             "pshufd  $tmp,$src2,0x03\n\t"
5266             "vaddss  $dst,$dst,$tmp\n\t"
5267             "vextractf128_high  $tmp2,$src2\n\t"
5268             "vaddss  $dst,$dst,$tmp2\n\t"
5269             "pshufd  $tmp,$tmp2,0x01\n\t"
5270             "vaddss  $dst,$dst,$tmp\n\t"
5271             "pshufd  $tmp,$tmp2,0x02\n\t"
5272             "vaddss  $dst,$dst,$tmp\n\t"
5273             "pshufd  $tmp,$tmp2,0x03\n\t"
5274             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5275   ins_encode %{
5276     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5277     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5278     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5279     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5280     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5281     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5282     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5283     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5284     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5285     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5286     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5287     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5288     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5289     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5290     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5291   %}
5292   ins_pipe( pipe_slow );
5293 %}
5294 
5295 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5296   predicate(UseAVX > 2);
5297   match(Set dst (AddReductionVF dst src2));
5298   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5299   format %{ "vaddss  $dst,$dst,$src2\n\t"
5300             "pshufd  $tmp,$src2,0x01\n\t"
5301             "vaddss  $dst,$dst,$tmp\n\t"
5302             "pshufd  $tmp,$src2,0x02\n\t"
5303             "vaddss  $dst,$dst,$tmp\n\t"
5304             "pshufd  $tmp,$src2,0x03\n\t"
5305             "vaddss  $dst,$dst,$tmp\n\t"
5306             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5307             "vaddss  $dst,$dst,$tmp2\n\t"
5308             "pshufd  $tmp,$tmp2,0x01\n\t"
5309             "vaddss  $dst,$dst,$tmp\n\t"
5310             "pshufd  $tmp,$tmp2,0x02\n\t"
5311             "vaddss  $dst,$dst,$tmp\n\t"
5312             "pshufd  $tmp,$tmp2,0x03\n\t"
5313             "vaddss  $dst,$dst,$tmp\n\t"
5314             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5315             "vaddss  $dst,$dst,$tmp2\n\t"
5316             "pshufd  $tmp,$tmp2,0x01\n\t"
5317             "vaddss  $dst,$dst,$tmp\n\t"
5318             "pshufd  $tmp,$tmp2,0x02\n\t"
5319             "vaddss  $dst,$dst,$tmp\n\t"
5320             "pshufd  $tmp,$tmp2,0x03\n\t"
5321             "vaddss  $dst,$dst,$tmp\n\t"
5322             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5323             "vaddss  $dst,$dst,$tmp2\n\t"
5324             "pshufd  $tmp,$tmp2,0x01\n\t"
5325             "vaddss  $dst,$dst,$tmp\n\t"
5326             "pshufd  $tmp,$tmp2,0x02\n\t"
5327             "vaddss  $dst,$dst,$tmp\n\t"
5328             "pshufd  $tmp,$tmp2,0x03\n\t"
5329             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5330   ins_encode %{
5331     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5332     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5333     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5334     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5335     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5336     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5337     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5338     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5339     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5340     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5341     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5342     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5343     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5344     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5345     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5346     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5347     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5348     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5349     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5350     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5351     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5352     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5353     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5354     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5355     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5356     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5357     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5358     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5359     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5360     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5361     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5362   %}
5363   ins_pipe( pipe_slow );
5364 %}
5365 
5366 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5367   predicate(UseSSE >= 1 && UseAVX == 0);
5368   match(Set dst (AddReductionVD dst src2));
5369   effect(TEMP tmp, TEMP dst);
5370   format %{ "addsd   $dst,$src2\n\t"
5371             "pshufd  $tmp,$src2,0xE\n\t"
5372             "addsd   $dst,$tmp\t! add reduction2D" %}
5373   ins_encode %{
5374     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5375     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5376     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5377   %}
5378   ins_pipe( pipe_slow );
5379 %}
5380 
5381 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5382   predicate(UseAVX > 0);
5383   match(Set dst (AddReductionVD dst src2));
5384   effect(TEMP tmp, TEMP dst);
5385   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5386             "pshufd  $tmp,$src2,0xE\n\t"
5387             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5388   ins_encode %{
5389     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5390     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5391     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5392   %}
5393   ins_pipe( pipe_slow );
5394 %}
5395 
5396 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5397   predicate(UseAVX > 0);
5398   match(Set dst (AddReductionVD dst src2));
5399   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5400   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5401             "pshufd  $tmp,$src2,0xE\n\t"
5402             "vaddsd  $dst,$dst,$tmp\n\t"
5403             "vextractf128  $tmp2,$src2,0x1\n\t"
5404             "vaddsd  $dst,$dst,$tmp2\n\t"
5405             "pshufd  $tmp,$tmp2,0xE\n\t"
5406             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5407   ins_encode %{
5408     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5409     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5410     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5411     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5412     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5413     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5414     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5415   %}
5416   ins_pipe( pipe_slow );
5417 %}
5418 
5419 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5420   predicate(UseAVX > 2);
5421   match(Set dst (AddReductionVD dst src2));
5422   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5423   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5424             "pshufd  $tmp,$src2,0xE\n\t"
5425             "vaddsd  $dst,$dst,$tmp\n\t"
5426             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5427             "vaddsd  $dst,$dst,$tmp2\n\t"
5428             "pshufd  $tmp,$tmp2,0xE\n\t"
5429             "vaddsd  $dst,$dst,$tmp\n\t"
5430             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5431             "vaddsd  $dst,$dst,$tmp2\n\t"
5432             "pshufd  $tmp,$tmp2,0xE\n\t"
5433             "vaddsd  $dst,$dst,$tmp\n\t"
5434             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5435             "vaddsd  $dst,$dst,$tmp2\n\t"
5436             "pshufd  $tmp,$tmp2,0xE\n\t"
5437             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5438   ins_encode %{
5439     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5440     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5441     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5442     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5443     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5444     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5445     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5446     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5447     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5448     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5449     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5450     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5451     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5452     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5453     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5454   %}
5455   ins_pipe( pipe_slow );
5456 %}
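
// Note on the float/double add reductions above: unlike the integer forms,
// they accumulate strictly lane by lane into $dst with scalar adds
// (addss/vaddss, addsd/vaddsd), presumably because floating-point addition is
// not associative and the lane order must be preserved. Sketch of the value
// computed (illustrative only):
//
//   float add_reduction_vf(float dst, const float* v, int lanes) {
//     for (int i = 0; i < lanes; i++) {  // strictly in lane order
//       dst += v[i];
//     }
//     return dst;
//   }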
5457 
5458 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5459   predicate(UseSSE > 3 && UseAVX == 0);
5460   match(Set dst (MulReductionVI src1 src2));
5461   effect(TEMP tmp, TEMP tmp2);
5462   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5463             "pmulld  $tmp2,$src2\n\t"
5464             "movd    $tmp,$src1\n\t"
5465             "pmulld  $tmp2,$tmp\n\t"
5466             "movd    $dst,$tmp2\t! mul reduction2I" %}
5467   ins_encode %{
5468     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5469     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5470     __ movdl($tmp$$XMMRegister, $src1$$Register);
5471     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5472     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5473   %}
5474   ins_pipe( pipe_slow );
5475 %}
5476 
5477 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5478   predicate(UseAVX > 0);
5479   match(Set dst (MulReductionVI src1 src2));
5480   effect(TEMP tmp, TEMP tmp2);
5481   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5482             "vpmulld  $tmp,$src2,$tmp2\n\t"
5483             "movd     $tmp2,$src1\n\t"
5484             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5485             "movd     $dst,$tmp2\t! mul reduction2I" %}
5486   ins_encode %{
5487     int vector_len = 0;
5488     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5489     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5490     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5491     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5492     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5493   %}
5494   ins_pipe( pipe_slow );
5495 %}
5496 
5497 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5498   predicate(UseSSE > 3 && UseAVX == 0);
5499   match(Set dst (MulReductionVI src1 src2));
5500   effect(TEMP tmp, TEMP tmp2);
5501   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5502             "pmulld  $tmp2,$src2\n\t"
5503             "pshufd  $tmp,$tmp2,0x1\n\t"
5504             "pmulld  $tmp2,$tmp\n\t"
5505             "movd    $tmp,$src1\n\t"
5506             "pmulld  $tmp2,$tmp\n\t"
5507             "movd    $dst,$tmp2\t! mul reduction4I" %}
5508   ins_encode %{
5509     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5510     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5511     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5512     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5513     __ movdl($tmp$$XMMRegister, $src1$$Register);
5514     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5515     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5516   %}
5517   ins_pipe( pipe_slow );
5518 %}
5519 
5520 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5521   predicate(UseAVX > 0);
5522   match(Set dst (MulReductionVI src1 src2));
5523   effect(TEMP tmp, TEMP tmp2);
5524   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5525             "vpmulld  $tmp,$src2,$tmp2\n\t"
5526             "pshufd   $tmp2,$tmp,0x1\n\t"
5527             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5528             "movd     $tmp2,$src1\n\t"
5529             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5530             "movd     $dst,$tmp2\t! mul reduction4I" %}
5531   ins_encode %{
5532     int vector_len = 0;
5533     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5534     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5535     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5536     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5537     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5538     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5539     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5540   %}
5541   ins_pipe( pipe_slow );
5542 %}
5543 
5544 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5545   predicate(UseAVX > 1);
5546   match(Set dst (MulReductionVI src1 src2));
5547   effect(TEMP tmp, TEMP tmp2);
5548   format %{ "vextracti128_high  $tmp,$src2\n\t"
5549             "vpmulld  $tmp,$tmp,$src2\n\t"
5550             "pshufd   $tmp2,$tmp,0xE\n\t"
5551             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5552             "pshufd   $tmp2,$tmp,0x1\n\t"
5553             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5554             "movd     $tmp2,$src1\n\t"
5555             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5556             "movd     $dst,$tmp2\t! mul reduction8I" %}
5557   ins_encode %{
5558     int vector_len = 0;
5559     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5560     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5561     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5562     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5563     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5564     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5565     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5566     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5567     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5568   %}
5569   ins_pipe( pipe_slow );
5570 %}
5571 
5572 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5573   predicate(UseAVX > 2);
5574   match(Set dst (MulReductionVI src1 src2));
5575   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5576   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5577             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5578             "vextracti128_high  $tmp,$tmp3\n\t"
5579             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5580             "pshufd   $tmp2,$tmp,0xE\n\t"
5581             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5582             "pshufd   $tmp2,$tmp,0x1\n\t"
5583             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5584             "movd     $tmp2,$src1\n\t"
5585             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5586             "movd     $dst,$tmp2\t! mul reduction16I" %}
5587   ins_encode %{
5588     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5589     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5590     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5591     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5592     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5593     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5594     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5595     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5596     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5597     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5598     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5599   %}
5600   ins_pipe( pipe_slow );
5601 %}
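
// The integer multiply reductions mirror the add reductions above, with
// pmulld/vpmulld as the combining step; the SSE-only forms are predicated on
// UseSSE > 3 because pmulld is an SSE4.1 instruction.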
5602 
5603 #ifdef _LP64
5604 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5605   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5606   match(Set dst (MulReductionVL src1 src2));
5607   effect(TEMP tmp, TEMP tmp2);
5608   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5609             "vpmullq  $tmp,$src2,$tmp2\n\t"
5610             "movdq    $tmp2,$src1\n\t"
5611             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5612             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5613   ins_encode %{
5614     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5615     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5616     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5617     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5618     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5619   %}
5620   ins_pipe( pipe_slow );
5621 %}
5622 
5623 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5624   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5625   match(Set dst (MulReductionVL src1 src2));
5626   effect(TEMP tmp, TEMP tmp2);
5627   format %{ "vextracti128_high  $tmp,$src2\n\t"
5628             "vpmullq  $tmp2,$tmp,$src2\n\t"
5629             "pshufd   $tmp,$tmp2,0xE\n\t"
5630             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5631             "movdq    $tmp,$src1\n\t"
5632             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5633             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5634   ins_encode %{
5635     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5636     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5637     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5638     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5639     __ movdq($tmp$$XMMRegister, $src1$$Register);
5640     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5641     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5642   %}
5643   ins_pipe( pipe_slow );
5644 %}
5645 
5646 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5647   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5648   match(Set dst (MulReductionVL src1 src2));
5649   effect(TEMP tmp, TEMP tmp2);
5650   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5651             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5652             "vextracti128_high  $tmp,$tmp2\n\t"
5653             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5654             "pshufd   $tmp,$tmp2,0xE\n\t"
5655             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5656             "movdq    $tmp,$src1\n\t"
5657             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5658             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5659   ins_encode %{
5660     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5661     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5662     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5663     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5664     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5665     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5666     __ movdq($tmp$$XMMRegister, $src1$$Register);
5667     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5668     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5669   %}
5670   ins_pipe( pipe_slow );
5671 %}
5672 #endif
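
// The long multiply reductions above additionally require
// VM_Version::supports_avx512dq(), since vpmullq is an AVX-512DQ instruction.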
5673 
5674 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5675   predicate(UseSSE >= 1 && UseAVX == 0);
5676   match(Set dst (MulReductionVF dst src2));
5677   effect(TEMP dst, TEMP tmp);
5678   format %{ "mulss   $dst,$src2\n\t"
5679             "pshufd  $tmp,$src2,0x01\n\t"
5680             "mulss   $dst,$tmp\t! mul reduction2F" %}
5681   ins_encode %{
5682     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5683     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5684     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5685   %}
5686   ins_pipe( pipe_slow );
5687 %}
5688 
5689 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5690   predicate(UseAVX > 0);
5691   match(Set dst (MulReductionVF dst src2));
5692   effect(TEMP tmp, TEMP dst);
5693   format %{ "vmulss  $dst,$dst,$src2\n\t"
5694             "pshufd  $tmp,$src2,0x01\n\t"
5695             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5696   ins_encode %{
5697     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5698     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5699     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5700   %}
5701   ins_pipe( pipe_slow );
5702 %}
5703 
5704 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5705   predicate(UseSSE >= 1 && UseAVX == 0);
5706   match(Set dst (MulReductionVF dst src2));
5707   effect(TEMP dst, TEMP tmp);
5708   format %{ "mulss   $dst,$src2\n\t"
5709             "pshufd  $tmp,$src2,0x01\n\t"
5710             "mulss   $dst,$tmp\n\t"
5711             "pshufd  $tmp,$src2,0x02\n\t"
5712             "mulss   $dst,$tmp\n\t"
5713             "pshufd  $tmp,$src2,0x03\n\t"
5714             "mulss   $dst,$tmp\t! mul reduction4F" %}
5715   ins_encode %{
5716     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5717     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5718     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5719     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5720     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5721     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5722     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5723   %}
5724   ins_pipe( pipe_slow );
5725 %}
5726 
5727 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5728   predicate(UseAVX > 0);
5729   match(Set dst (MulReductionVF dst src2));
5730   effect(TEMP tmp, TEMP dst);
5731   format %{ "vmulss  $dst,$dst,$src2\n\t"
5732             "pshufd  $tmp,$src2,0x01\n\t"
5733             "vmulss  $dst,$dst,$tmp\n\t"
5734             "pshufd  $tmp,$src2,0x02\n\t"
5735             "vmulss  $dst,$dst,$tmp\n\t"
5736             "pshufd  $tmp,$src2,0x03\n\t"
5737             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5738   ins_encode %{
5739     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5740     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5741     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5742     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5743     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5744     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5745     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5746   %}
5747   ins_pipe( pipe_slow );
5748 %}
5749 
5750 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5751   predicate(UseAVX > 0);
5752   match(Set dst (MulReductionVF dst src2));
5753   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5754   format %{ "vmulss  $dst,$dst,$src2\n\t"
5755             "pshufd  $tmp,$src2,0x01\n\t"
5756             "vmulss  $dst,$dst,$tmp\n\t"
5757             "pshufd  $tmp,$src2,0x02\n\t"
5758             "vmulss  $dst,$dst,$tmp\n\t"
5759             "pshufd  $tmp,$src2,0x03\n\t"
5760             "vmulss  $dst,$dst,$tmp\n\t"
5761             "vextractf128_high  $tmp2,$src2\n\t"
5762             "vmulss  $dst,$dst,$tmp2\n\t"
5763             "pshufd  $tmp,$tmp2,0x01\n\t"
5764             "vmulss  $dst,$dst,$tmp\n\t"
5765             "pshufd  $tmp,$tmp2,0x02\n\t"
5766             "vmulss  $dst,$dst,$tmp\n\t"
5767             "pshufd  $tmp,$tmp2,0x03\n\t"
5768             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5769   ins_encode %{
5770     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5771     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5772     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5773     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5774     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5775     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5776     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5777     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5778     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5779     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5780     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5781     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5782     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5783     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5784     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5785   %}
5786   ins_pipe( pipe_slow );
5787 %}
5788 
5789 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5790   predicate(UseAVX > 2);
5791   match(Set dst (MulReductionVF dst src2));
5792   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5793   format %{ "vmulss  $dst,$dst,$src2\n\t"
5794             "pshufd  $tmp,$src2,0x01\n\t"
5795             "vmulss  $dst,$dst,$tmp\n\t"
5796             "pshufd  $tmp,$src2,0x02\n\t"
5797             "vmulss  $dst,$dst,$tmp\n\t"
5798             "pshufd  $tmp,$src2,0x03\n\t"
5799             "vmulss  $dst,$dst,$tmp\n\t"
5800             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5801             "vmulss  $dst,$dst,$tmp2\n\t"
5802             "pshufd  $tmp,$tmp2,0x01\n\t"
5803             "vmulss  $dst,$dst,$tmp\n\t"
5804             "pshufd  $tmp,$tmp2,0x02\n\t"
5805             "vmulss  $dst,$dst,$tmp\n\t"
5806             "pshufd  $tmp,$tmp2,0x03\n\t"
5807             "vmulss  $dst,$dst,$tmp\n\t"
5808             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5809             "vmulss  $dst,$dst,$tmp2\n\t"
5810             "pshufd  $tmp,$tmp2,0x01\n\t"
5811             "vmulss  $dst,$dst,$tmp\n\t"
5812             "pshufd  $tmp,$tmp2,0x02\n\t"
5813             "vmulss  $dst,$dst,$tmp\n\t"
5814             "pshufd  $tmp,$tmp2,0x03\n\t"
5815             "vmulss  $dst,$dst,$tmp\n\t"
5816             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5817             "vmulss  $dst,$dst,$tmp2\n\t"
5818             "pshufd  $tmp,$tmp2,0x01\n\t"
5819             "vmulss  $dst,$dst,$tmp\n\t"
5820             "pshufd  $tmp,$tmp2,0x02\n\t"
5821             "vmulss  $dst,$dst,$tmp\n\t"
5822             "pshufd  $tmp,$tmp2,0x03\n\t"
5823             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5824   ins_encode %{
5825     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5826     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5827     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5828     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5829     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5830     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5831     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5832     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5833     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5834     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5835     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5836     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5837     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5838     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5839     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5840     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5841     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5842     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5843     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5844     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5845     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5846     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5847     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5848     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5849     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5850     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5851     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5852     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5853     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5854     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5855     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5856   %}
5857   ins_pipe( pipe_slow );
5858 %}
5859 
5860 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5861   predicate(UseSSE >= 1 && UseAVX == 0);
5862   match(Set dst (MulReductionVD dst src2));
5863   effect(TEMP dst, TEMP tmp);
5864   format %{ "mulsd   $dst,$src2\n\t"
5865             "pshufd  $tmp,$src2,0xE\n\t"
5866             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5867   ins_encode %{
5868     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5869     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5870     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5871   %}
5872   ins_pipe( pipe_slow );
5873 %}
5874 
5875 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5876   predicate(UseAVX > 0);
5877   match(Set dst (MulReductionVD dst src2));
5878   effect(TEMP tmp, TEMP dst);
5879   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5880             "pshufd  $tmp,$src2,0xE\n\t"
5881             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5882   ins_encode %{
5883     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5884     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5885     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5886   %}
5887   ins_pipe( pipe_slow );
5888 %}
5889 
5890 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
5891   predicate(UseAVX > 0);
5892   match(Set dst (MulReductionVD dst src2));
5893   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5894   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5895             "pshufd  $tmp,$src2,0xE\n\t"
5896             "vmulsd  $dst,$dst,$tmp\n\t"
5897             "vextractf128_high  $tmp2,$src2\n\t"
5898             "vmulsd  $dst,$dst,$tmp2\n\t"
5899             "pshufd  $tmp,$tmp2,0xE\n\t"
5900             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5901   ins_encode %{
5902     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5903     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5904     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5905     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5906     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5907     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5908     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5909   %}
5910   ins_pipe( pipe_slow );
5911 %}
5912 
5913 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5914   predicate(UseAVX > 2);
5915   match(Set dst (MulReductionVD dst src2));
5916   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5917   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5918             "pshufd  $tmp,$src2,0xE\n\t"
5919             "vmulsd  $dst,$dst,$tmp\n\t"
5920             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5921             "vmulsd  $dst,$dst,$tmp2\n\t"
5922             "pshufd  $tmp,$tmp2,0xE\n\t"
5923             "vmulsd  $dst,$dst,$tmp\n\t"
5924             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5925             "vmulsd  $dst,$dst,$tmp2\n\t"
5926             "pshufd  $tmp,$tmp2,0xE\n\t"
5927             "vmulsd  $dst,$dst,$tmp\n\t"
5928             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5929             "vmulsd  $dst,$dst,$tmp2\n\t"
5930             "pshufd  $tmp,$tmp2,0xE\n\t"
5931             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5932   ins_encode %{
5933     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5934     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5935     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5936     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5937     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5938     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5939     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5940     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5941     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5942     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5943     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5944     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5945     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5946     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5947     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5948   %}
5949   ins_pipe( pipe_slow );
5950 %}
5951 
5952 // ====================VECTOR ARITHMETIC=======================================
5953 
5954 // --------------------------------- ADD --------------------------------------
5955 
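// Each element size below typically comes in three forms: an SSE form that
// adds in place (dst = dst + src), an AVX three-operand register form, and an
// AVX form with a memory operand (LoadVector). The vector_len argument encodes
// the vector width: 0 = 128-bit, 1 = 256-bit, 2 = 512-bit. Per-lane semantics
// (sketch): for (int i = 0; i < lanes; i++) dst[i] = src1[i] + src2[i];
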
5956 // Bytes vector add
5957 instruct vadd4B(vecS dst, vecS src) %{
5958   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5959   match(Set dst (AddVB dst src));
5960   format %{ "paddb   $dst,$src\t! add packed4B" %}
5961   ins_encode %{
5962     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5963   %}
5964   ins_pipe( pipe_slow );
5965 %}
5966 
5967 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5968   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5969   match(Set dst (AddVB src1 src2));
5970   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5971   ins_encode %{
5972     int vector_len = 0;
5973     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5974   %}
5975   ins_pipe( pipe_slow );
5976 %}
5977 
5978 
5979 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
5980   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5981   match(Set dst (AddVB src (LoadVector mem)));
5982   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5983   ins_encode %{
5984     int vector_len = 0;
5985     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5986   %}
5987   ins_pipe( pipe_slow );
5988 %}
5989 
5990 instruct vadd8B(vecD dst, vecD src) %{
5991   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5992   match(Set dst (AddVB dst src));
5993   format %{ "paddb   $dst,$src\t! add packed8B" %}
5994   ins_encode %{
5995     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5996   %}
5997   ins_pipe( pipe_slow );
5998 %}
5999 
6000 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6001   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6002   match(Set dst (AddVB src1 src2));
6003   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6004   ins_encode %{
6005     int vector_len = 0;
6006     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6007   %}
6008   ins_pipe( pipe_slow );
6009 %}
6010 
6011 
6012 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6013   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6014   match(Set dst (AddVB src (LoadVector mem)));
6015   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6016   ins_encode %{
6017     int vector_len = 0;
6018     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6019   %}
6020   ins_pipe( pipe_slow );
6021 %}
6022 
6023 instruct vadd16B(vecX dst, vecX src) %{
6024   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6025   match(Set dst (AddVB dst src));
6026   format %{ "paddb   $dst,$src\t! add packed16B" %}
6027   ins_encode %{
6028     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6029   %}
6030   ins_pipe( pipe_slow );
6031 %}
6032 
6033 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6034   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6035   match(Set dst (AddVB src1 src2));
6036   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6037   ins_encode %{
6038     int vector_len = 0;
6039     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6040   %}
6041   ins_pipe( pipe_slow );
6042 %}
6043 
6044 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6045   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6046   match(Set dst (AddVB src (LoadVector mem)));
6047   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6048   ins_encode %{
6049     int vector_len = 0;
6050     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6051   %}
6052   ins_pipe( pipe_slow );
6053 %}
6054 
6055 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6056   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6057   match(Set dst (AddVB src1 src2));
6058   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6059   ins_encode %{
6060     int vector_len = 1;
6061     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6062   %}
6063   ins_pipe( pipe_slow );
6064 %}
6065 
6066 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6067   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6068   match(Set dst (AddVB src (LoadVector mem)));
6069   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6070   ins_encode %{
6071     int vector_len = 1;
6072     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6073   %}
6074   ins_pipe( pipe_slow );
6075 %}
6076 
6077 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6078   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6079   match(Set dst (AddVB src1 src2));
6080   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6081   ins_encode %{
6082     int vector_len = 2;
6083     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6084   %}
6085   ins_pipe( pipe_slow );
6086 %}
6087 
6088 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6089   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6090   match(Set dst (AddVB src (LoadVector mem)));
6091   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6092   ins_encode %{
6093     int vector_len = 2;
6094     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6095   %}
6096   ins_pipe( pipe_slow );
6097 %}
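
// The 512-bit byte forms above (and the 512-bit short forms below) are gated
// on VM_Version::supports_avx512bw(), since EVEX-encoded byte/word arithmetic
// needs the AVX-512BW extension; the 512-bit int/long forms only require
// UseAVX > 2 (AVX-512F).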
6098 
6099 // Shorts/Chars vector add
6100 instruct vadd2S(vecS dst, vecS src) %{
6101   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6102   match(Set dst (AddVS dst src));
6103   format %{ "paddw   $dst,$src\t! add packed2S" %}
6104   ins_encode %{
6105     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6106   %}
6107   ins_pipe( pipe_slow );
6108 %}
6109 
6110 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6111   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6112   match(Set dst (AddVS src1 src2));
6113   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6114   ins_encode %{
6115     int vector_len = 0;
6116     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6117   %}
6118   ins_pipe( pipe_slow );
6119 %}
6120 
6121 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6122   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6123   match(Set dst (AddVS src (LoadVector mem)));
6124   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6125   ins_encode %{
6126     int vector_len = 0;
6127     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6128   %}
6129   ins_pipe( pipe_slow );
6130 %}
6131 
6132 instruct vadd4S(vecD dst, vecD src) %{
6133   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6134   match(Set dst (AddVS dst src));
6135   format %{ "paddw   $dst,$src\t! add packed4S" %}
6136   ins_encode %{
6137     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6138   %}
6139   ins_pipe( pipe_slow );
6140 %}
6141 
6142 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6143   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6144   match(Set dst (AddVS src1 src2));
6145   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6146   ins_encode %{
6147     int vector_len = 0;
6148     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6149   %}
6150   ins_pipe( pipe_slow );
6151 %}
6152 
6153 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6154   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6155   match(Set dst (AddVS src (LoadVector mem)));
6156   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6157   ins_encode %{
6158     int vector_len = 0;
6159     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6160   %}
6161   ins_pipe( pipe_slow );
6162 %}
6163 
6164 instruct vadd8S(vecX dst, vecX src) %{
6165   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6166   match(Set dst (AddVS dst src));
6167   format %{ "paddw   $dst,$src\t! add packed8S" %}
6168   ins_encode %{
6169     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6170   %}
6171   ins_pipe( pipe_slow );
6172 %}
6173 
6174 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6175   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6176   match(Set dst (AddVS src1 src2));
6177   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6178   ins_encode %{
6179     int vector_len = 0;
6180     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6181   %}
6182   ins_pipe( pipe_slow );
6183 %}
6184 
6185 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6186   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6187   match(Set dst (AddVS src (LoadVector mem)));
6188   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6189   ins_encode %{
6190     int vector_len = 0;
6191     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6192   %}
6193   ins_pipe( pipe_slow );
6194 %}
6195 
6196 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6197   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6198   match(Set dst (AddVS src1 src2));
6199   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6200   ins_encode %{
6201     int vector_len = 1;
6202     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6203   %}
6204   ins_pipe( pipe_slow );
6205 %}
6206 
6207 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6208   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6209   match(Set dst (AddVS src (LoadVector mem)));
6210   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6211   ins_encode %{
6212     int vector_len = 1;
6213     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6214   %}
6215   ins_pipe( pipe_slow );
6216 %}
6217 
6218 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6219   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6220   match(Set dst (AddVS src1 src2));
6221   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6222   ins_encode %{
6223     int vector_len = 2;
6224     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6225   %}
6226   ins_pipe( pipe_slow );
6227 %}
6228 
6229 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6230   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6231   match(Set dst (AddVS src (LoadVector mem)));
6232   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6233   ins_encode %{
6234     int vector_len = 2;
6235     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6236   %}
6237   ins_pipe( pipe_slow );
6238 %}
6239 
6240 // Integers vector add
6241 instruct vadd2I(vecD dst, vecD src) %{
6242   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6243   match(Set dst (AddVI dst src));
6244   format %{ "paddd   $dst,$src\t! add packed2I" %}
6245   ins_encode %{
6246     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6247   %}
6248   ins_pipe( pipe_slow );
6249 %}
6250 
6251 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6252   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6253   match(Set dst (AddVI src1 src2));
6254   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6255   ins_encode %{
6256     int vector_len = 0;
6257     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6258   %}
6259   ins_pipe( pipe_slow );
6260 %}
6261 
6262 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6263   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6264   match(Set dst (AddVI src (LoadVector mem)));
6265   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6266   ins_encode %{
6267     int vector_len = 0;
6268     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6269   %}
6270   ins_pipe( pipe_slow );
6271 %}
6272 
6273 instruct vadd4I(vecX dst, vecX src) %{
6274   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6275   match(Set dst (AddVI dst src));
6276   format %{ "paddd   $dst,$src\t! add packed4I" %}
6277   ins_encode %{
6278     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6279   %}
6280   ins_pipe( pipe_slow );
6281 %}
6282 
6283 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6284   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6285   match(Set dst (AddVI src1 src2));
6286   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6287   ins_encode %{
6288     int vector_len = 0;
6289     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6290   %}
6291   ins_pipe( pipe_slow );
6292 %}
6293 
6294 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6295   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6296   match(Set dst (AddVI src (LoadVector mem)));
6297   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6298   ins_encode %{
6299     int vector_len = 0;
6300     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6301   %}
6302   ins_pipe( pipe_slow );
6303 %}
6304 
6305 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6306   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6307   match(Set dst (AddVI src1 src2));
6308   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6309   ins_encode %{
6310     int vector_len = 1;
6311     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6312   %}
6313   ins_pipe( pipe_slow );
6314 %}
6315 
6316 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6317   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6318   match(Set dst (AddVI src (LoadVector mem)));
6319   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6320   ins_encode %{
6321     int vector_len = 1;
6322     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6323   %}
6324   ins_pipe( pipe_slow );
6325 %}
6326 
6327 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6328   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6329   match(Set dst (AddVI src1 src2));
6330   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6331   ins_encode %{
6332     int vector_len = 2;
6333     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6334   %}
6335   ins_pipe( pipe_slow );
6336 %}
6337 
6338 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6339   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6340   match(Set dst (AddVI src (LoadVector mem)));
6341   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6342   ins_encode %{
6343     int vector_len = 2;
6344     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6345   %}
6346   ins_pipe( pipe_slow );
6347 %}
6348 
6349 // Longs vector add
6350 instruct vadd2L(vecX dst, vecX src) %{
6351   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6352   match(Set dst (AddVL dst src));
6353   format %{ "paddq   $dst,$src\t! add packed2L" %}
6354   ins_encode %{
6355     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6356   %}
6357   ins_pipe( pipe_slow );
6358 %}
6359 
6360 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6361   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6362   match(Set dst (AddVL src1 src2));
6363   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6364   ins_encode %{
6365     int vector_len = 0;
6366     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6367   %}
6368   ins_pipe( pipe_slow );
6369 %}
6370 
6371 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6372   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6373   match(Set dst (AddVL src (LoadVector mem)));
6374   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6375   ins_encode %{
6376     int vector_len = 0;
6377     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6378   %}
6379   ins_pipe( pipe_slow );
6380 %}
6381 
6382 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6383   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6384   match(Set dst (AddVL src1 src2));
6385   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6386   ins_encode %{
6387     int vector_len = 1;
6388     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6389   %}
6390   ins_pipe( pipe_slow );
6391 %}
6392 
6393 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6394   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6395   match(Set dst (AddVL src (LoadVector mem)));
6396   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6397   ins_encode %{
6398     int vector_len = 1;
6399     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6400   %}
6401   ins_pipe( pipe_slow );
6402 %}
6403 
6404 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6405   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6406   match(Set dst (AddVL src1 src2));
6407   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6408   ins_encode %{
6409     int vector_len = 2;
6410     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6411   %}
6412   ins_pipe( pipe_slow );
6413 %}
6414 
6415 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6416   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6417   match(Set dst (AddVL src (LoadVector mem)));
6418   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6419   ins_encode %{
6420     int vector_len = 2;
6421     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6422   %}
6423   ins_pipe( pipe_slow );
6424 %}
6425 
6426 // Floats vector add
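// 256-bit packed FP adds are available with plain AVX (vaddps ymm), which is why
// the 8F rules below require only UseAVX > 0, whereas the 256-bit integer adds
// above (vpaddd ymm) need UseAVX > 1 (AVX2).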
6427 instruct vadd2F(vecD dst, vecD src) %{
6428   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6429   match(Set dst (AddVF dst src));
6430   format %{ "addps   $dst,$src\t! add packed2F" %}
6431   ins_encode %{
6432     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6433   %}
6434   ins_pipe( pipe_slow );
6435 %}
6436 
6437 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6438   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6439   match(Set dst (AddVF src1 src2));
6440   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6441   ins_encode %{
6442     int vector_len = 0;
6443     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6444   %}
6445   ins_pipe( pipe_slow );
6446 %}
6447 
6448 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6449   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6450   match(Set dst (AddVF src (LoadVector mem)));
6451   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6452   ins_encode %{
6453     int vector_len = 0;
6454     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6455   %}
6456   ins_pipe( pipe_slow );
6457 %}
6458 
6459 instruct vadd4F(vecX dst, vecX src) %{
6460   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6461   match(Set dst (AddVF dst src));
6462   format %{ "addps   $dst,$src\t! add packed4F" %}
6463   ins_encode %{
6464     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6465   %}
6466   ins_pipe( pipe_slow );
6467 %}
6468 
6469 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6470   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6471   match(Set dst (AddVF src1 src2));
6472   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6473   ins_encode %{
6474     int vector_len = 0;
6475     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6476   %}
6477   ins_pipe( pipe_slow );
6478 %}
6479 
6480 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6481   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6482   match(Set dst (AddVF src (LoadVector mem)));
6483   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6484   ins_encode %{
6485     int vector_len = 0;
6486     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6487   %}
6488   ins_pipe( pipe_slow );
6489 %}
6490 
6491 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6492   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6493   match(Set dst (AddVF src1 src2));
6494   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6495   ins_encode %{
6496     int vector_len = 1;
6497     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6498   %}
6499   ins_pipe( pipe_slow );
6500 %}
6501 
6502 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6503   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6504   match(Set dst (AddVF src (LoadVector mem)));
6505   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6506   ins_encode %{
6507     int vector_len = 1;
6508     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6509   %}
6510   ins_pipe( pipe_slow );
6511 %}
6512 
6513 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6514   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6515   match(Set dst (AddVF src1 src2));
6516   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6517   ins_encode %{
6518     int vector_len = 2;
6519     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6520   %}
6521   ins_pipe( pipe_slow );
6522 %}
6523 
6524 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6525   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6526   match(Set dst (AddVF src (LoadVector mem)));
6527   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6528   ins_encode %{
6529     int vector_len = 2;
6530     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6531   %}
6532   ins_pipe( pipe_slow );
6533 %}
6534 
6535 // Doubles vector add
6536 instruct vadd2D(vecX dst, vecX src) %{
6537   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6538   match(Set dst (AddVD dst src));
6539   format %{ "addpd   $dst,$src\t! add packed2D" %}
6540   ins_encode %{
6541     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6542   %}
6543   ins_pipe( pipe_slow );
6544 %}
6545 
6546 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6548   match(Set dst (AddVD src1 src2));
6549   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6550   ins_encode %{
6551     int vector_len = 0;
6552     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6553   %}
6554   ins_pipe( pipe_slow );
6555 %}
6556 
6557 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6558   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6559   match(Set dst (AddVD src (LoadVector mem)));
6560   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6561   ins_encode %{
6562     int vector_len = 0;
6563     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6564   %}
6565   ins_pipe( pipe_slow );
6566 %}
6567 
6568 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6569   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6570   match(Set dst (AddVD src1 src2));
6571   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6572   ins_encode %{
6573     int vector_len = 1;
6574     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6575   %}
6576   ins_pipe( pipe_slow );
6577 %}
6578 
6579 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6580   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6581   match(Set dst (AddVD src (LoadVector mem)));
6582   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6583   ins_encode %{
6584     int vector_len = 1;
6585     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6586   %}
6587   ins_pipe( pipe_slow );
6588 %}
6589 
6590 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6591   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6592   match(Set dst (AddVD src1 src2));
6593   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6594   ins_encode %{
6595     int vector_len = 2;
6596     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6597   %}
6598   ins_pipe( pipe_slow );
6599 %}
6600 
6601 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6602   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6603   match(Set dst (AddVD src (LoadVector mem)));
6604   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6605   ins_encode %{
6606     int vector_len = 2;
6607     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6608   %}
6609   ins_pipe( pipe_slow );
6610 %}
6611 
6612 // --------------------------------- SUB --------------------------------------
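// The subtraction rules mirror the ADD rules above: same operand sizes and the
// same UseAVX/AVX-512BW predicate scheme, with psubb/psubw/psubd/psubq and
// subps/subpd (or their VEX/EVEX vpsub*/vsub* forms) in place of the adds.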
6613 
6614 // Bytes vector sub
6615 instruct vsub4B(vecS dst, vecS src) %{
6616   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6617   match(Set dst (SubVB dst src));
6618   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6619   ins_encode %{
6620     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6621   %}
6622   ins_pipe( pipe_slow );
6623 %}
6624 
6625 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6626   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6627   match(Set dst (SubVB src1 src2));
6628   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6629   ins_encode %{
6630     int vector_len = 0;
6631     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6632   %}
6633   ins_pipe( pipe_slow );
6634 %}
6635 
6636 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6637   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6638   match(Set dst (SubVB src (LoadVector mem)));
6639   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6640   ins_encode %{
6641     int vector_len = 0;
6642     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6643   %}
6644   ins_pipe( pipe_slow );
6645 %}
6646 
6647 instruct vsub8B(vecD dst, vecD src) %{
6648   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6649   match(Set dst (SubVB dst src));
6650   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6651   ins_encode %{
6652     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6653   %}
6654   ins_pipe( pipe_slow );
6655 %}
6656 
6657 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6658   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6659   match(Set dst (SubVB src1 src2));
6660   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6661   ins_encode %{
6662     int vector_len = 0;
6663     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6664   %}
6665   ins_pipe( pipe_slow );
6666 %}
6667 
6668 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6669   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6670   match(Set dst (SubVB src (LoadVector mem)));
6671   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6672   ins_encode %{
6673     int vector_len = 0;
6674     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6675   %}
6676   ins_pipe( pipe_slow );
6677 %}
6678 
6679 instruct vsub16B(vecX dst, vecX src) %{
6680   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6681   match(Set dst (SubVB dst src));
6682   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6683   ins_encode %{
6684     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6685   %}
6686   ins_pipe( pipe_slow );
6687 %}
6688 
6689 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6690   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6691   match(Set dst (SubVB src1 src2));
6692   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6693   ins_encode %{
6694     int vector_len = 0;
6695     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6696   %}
6697   ins_pipe( pipe_slow );
6698 %}
6699 
6700 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6701   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6702   match(Set dst (SubVB src (LoadVector mem)));
6703   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6704   ins_encode %{
6705     int vector_len = 0;
6706     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6707   %}
6708   ins_pipe( pipe_slow );
6709 %}
6710 
6711 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6712   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6713   match(Set dst (SubVB src1 src2));
6714   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6715   ins_encode %{
6716     int vector_len = 1;
6717     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6718   %}
6719   ins_pipe( pipe_slow );
6720 %}
6721 
6722 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6723   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6724   match(Set dst (SubVB src (LoadVector mem)));
6725   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6726   ins_encode %{
6727     int vector_len = 1;
6728     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6729   %}
6730   ins_pipe( pipe_slow );
6731 %}
6732 
6733 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6734   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6735   match(Set dst (SubVB src1 src2));
6736   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6737   ins_encode %{
6738     int vector_len = 2;
6739     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6740   %}
6741   ins_pipe( pipe_slow );
6742 %}
6743 
6744 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6745   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6746   match(Set dst (SubVB src (LoadVector mem)));
6747   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6748   ins_encode %{
6749     int vector_len = 2;
6750     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6751   %}
6752   ins_pipe( pipe_slow );
6753 %}
6754 
6755 // Shorts/Chars vector sub
6756 instruct vsub2S(vecS dst, vecS src) %{
6757   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6758   match(Set dst (SubVS dst src));
6759   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6760   ins_encode %{
6761     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6762   %}
6763   ins_pipe( pipe_slow );
6764 %}
6765 
6766 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6767   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6768   match(Set dst (SubVS src1 src2));
6769   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6770   ins_encode %{
6771     int vector_len = 0;
6772     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6773   %}
6774   ins_pipe( pipe_slow );
6775 %}
6776 
6777 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6778   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6779   match(Set dst (SubVS src (LoadVector mem)));
6780   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6781   ins_encode %{
6782     int vector_len = 0;
6783     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6784   %}
6785   ins_pipe( pipe_slow );
6786 %}
6787 
6788 instruct vsub4S(vecD dst, vecD src) %{
6789   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6790   match(Set dst (SubVS dst src));
6791   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6792   ins_encode %{
6793     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6794   %}
6795   ins_pipe( pipe_slow );
6796 %}
6797 
6798 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6799   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6800   match(Set dst (SubVS src1 src2));
6801   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6802   ins_encode %{
6803     int vector_len = 0;
6804     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6805   %}
6806   ins_pipe( pipe_slow );
6807 %}
6808 
6809 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6810   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6811   match(Set dst (SubVS src (LoadVector mem)));
6812   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6813   ins_encode %{
6814     int vector_len = 0;
6815     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6816   %}
6817   ins_pipe( pipe_slow );
6818 %}
6819 
6820 instruct vsub8S(vecX dst, vecX src) %{
6821   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6822   match(Set dst (SubVS dst src));
6823   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6824   ins_encode %{
6825     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6826   %}
6827   ins_pipe( pipe_slow );
6828 %}
6829 
6830 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6831   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6832   match(Set dst (SubVS src1 src2));
6833   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6834   ins_encode %{
6835     int vector_len = 0;
6836     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6837   %}
6838   ins_pipe( pipe_slow );
6839 %}
6840 
6841 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6842   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6843   match(Set dst (SubVS src (LoadVector mem)));
6844   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6845   ins_encode %{
6846     int vector_len = 0;
6847     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6848   %}
6849   ins_pipe( pipe_slow );
6850 %}
6851 
6852 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6853   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6854   match(Set dst (SubVS src1 src2));
6855   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6856   ins_encode %{
6857     int vector_len = 1;
6858     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6859   %}
6860   ins_pipe( pipe_slow );
6861 %}
6862 
6863 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6864   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6865   match(Set dst (SubVS src (LoadVector mem)));
6866   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6867   ins_encode %{
6868     int vector_len = 1;
6869     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6870   %}
6871   ins_pipe( pipe_slow );
6872 %}
6873 
6874 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6875   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6876   match(Set dst (SubVS src1 src2));
6877   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6878   ins_encode %{
6879     int vector_len = 2;
6880     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6881   %}
6882   ins_pipe( pipe_slow );
6883 %}
6884 
6885 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6886   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6887   match(Set dst (SubVS src (LoadVector mem)));
6888   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6889   ins_encode %{
6890     int vector_len = 2;
6891     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6892   %}
6893   ins_pipe( pipe_slow );
6894 %}
6895 
6896 // Integers vector sub
6897 instruct vsub2I(vecD dst, vecD src) %{
6898   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6899   match(Set dst (SubVI dst src));
6900   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6901   ins_encode %{
6902     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6903   %}
6904   ins_pipe( pipe_slow );
6905 %}
6906 
6907 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
6908   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6909   match(Set dst (SubVI src1 src2));
6910   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
6911   ins_encode %{
6912     int vector_len = 0;
6913     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6914   %}
6915   ins_pipe( pipe_slow );
6916 %}
6917 
6918 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
6919   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6920   match(Set dst (SubVI src (LoadVector mem)));
6921   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
6922   ins_encode %{
6923     int vector_len = 0;
6924     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6925   %}
6926   ins_pipe( pipe_slow );
6927 %}
6928 
6929 instruct vsub4I(vecX dst, vecX src) %{
6930   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6931   match(Set dst (SubVI dst src));
6932   format %{ "psubd   $dst,$src\t! sub packed4I" %}
6933   ins_encode %{
6934     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6935   %}
6936   ins_pipe( pipe_slow );
6937 %}
6938 
6939 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
6940   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6941   match(Set dst (SubVI src1 src2));
6942   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
6943   ins_encode %{
6944     int vector_len = 0;
6945     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6946   %}
6947   ins_pipe( pipe_slow );
6948 %}
6949 
6950 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
6951   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6952   match(Set dst (SubVI src (LoadVector mem)));
6953   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
6954   ins_encode %{
6955     int vector_len = 0;
6956     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6957   %}
6958   ins_pipe( pipe_slow );
6959 %}
6960 
6961 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
6962   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6963   match(Set dst (SubVI src1 src2));
6964   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
6965   ins_encode %{
6966     int vector_len = 1;
6967     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6968   %}
6969   ins_pipe( pipe_slow );
6970 %}
6971 
6972 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
6973   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6974   match(Set dst (SubVI src (LoadVector mem)));
6975   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
6976   ins_encode %{
6977     int vector_len = 1;
6978     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6979   %}
6980   ins_pipe( pipe_slow );
6981 %}
6982 
6983 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6984   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6985   match(Set dst (SubVI src1 src2));
6986   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
6987   ins_encode %{
6988     int vector_len = 2;
6989     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6990   %}
6991   ins_pipe( pipe_slow );
6992 %}
6993 
6994 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
6995   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6996   match(Set dst (SubVI src (LoadVector mem)));
6997   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
6998   ins_encode %{
6999     int vector_len = 2;
7000     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7001   %}
7002   ins_pipe( pipe_slow );
7003 %}
7004 
7005 // Longs vector sub
7006 instruct vsub2L(vecX dst, vecX src) %{
7007   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7008   match(Set dst (SubVL dst src));
7009   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7010   ins_encode %{
7011     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7012   %}
7013   ins_pipe( pipe_slow );
7014 %}
7015 
7016 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7017   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7018   match(Set dst (SubVL src1 src2));
7019   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7020   ins_encode %{
7021     int vector_len = 0;
7022     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7023   %}
7024   ins_pipe( pipe_slow );
7025 %}
7026 
7027 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7028   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7029   match(Set dst (SubVL src (LoadVector mem)));
7030   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7031   ins_encode %{
7032     int vector_len = 0;
7033     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7034   %}
7035   ins_pipe( pipe_slow );
7036 %}
7037 
7038 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7039   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7040   match(Set dst (SubVL src1 src2));
7041   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7042   ins_encode %{
7043     int vector_len = 1;
7044     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7045   %}
7046   ins_pipe( pipe_slow );
7047 %}
7048 
7049 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7050   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7051   match(Set dst (SubVL src (LoadVector mem)));
7052   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7053   ins_encode %{
7054     int vector_len = 1;
7055     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7056   %}
7057   ins_pipe( pipe_slow );
7058 %}
7059 
7060 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7061   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7062   match(Set dst (SubVL src1 src2));
7063   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7064   ins_encode %{
7065     int vector_len = 2;
7066     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7067   %}
7068   ins_pipe( pipe_slow );
7069 %}
7070 
7071 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7072   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7073   match(Set dst (SubVL src (LoadVector mem)));
7074   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7075   ins_encode %{
7076     int vector_len = 2;
7077     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7078   %}
7079   ins_pipe( pipe_slow );
7080 %}
7081 
7082 // Floats vector sub
7083 instruct vsub2F(vecD dst, vecD src) %{
7084   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7085   match(Set dst (SubVF dst src));
7086   format %{ "subps   $dst,$src\t! sub packed2F" %}
7087   ins_encode %{
7088     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7089   %}
7090   ins_pipe( pipe_slow );
7091 %}
7092 
7093 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7094   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7095   match(Set dst (SubVF src1 src2));
7096   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7097   ins_encode %{
7098     int vector_len = 0;
7099     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7100   %}
7101   ins_pipe( pipe_slow );
7102 %}
7103 
7104 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7105   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7106   match(Set dst (SubVF src (LoadVector mem)));
7107   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7108   ins_encode %{
7109     int vector_len = 0;
7110     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7111   %}
7112   ins_pipe( pipe_slow );
7113 %}
7114 
7115 instruct vsub4F(vecX dst, vecX src) %{
7116   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7117   match(Set dst (SubVF dst src));
7118   format %{ "subps   $dst,$src\t! sub packed4F" %}
7119   ins_encode %{
7120     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7121   %}
7122   ins_pipe( pipe_slow );
7123 %}
7124 
7125 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7126   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7127   match(Set dst (SubVF src1 src2));
7128   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7129   ins_encode %{
7130     int vector_len = 0;
7131     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7132   %}
7133   ins_pipe( pipe_slow );
7134 %}
7135 
7136 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7137   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7138   match(Set dst (SubVF src (LoadVector mem)));
7139   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7140   ins_encode %{
7141     int vector_len = 0;
7142     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7143   %}
7144   ins_pipe( pipe_slow );
7145 %}
7146 
7147 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7148   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7149   match(Set dst (SubVF src1 src2));
7150   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7151   ins_encode %{
7152     int vector_len = 1;
7153     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7154   %}
7155   ins_pipe( pipe_slow );
7156 %}
7157 
7158 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7159   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7160   match(Set dst (SubVF src (LoadVector mem)));
7161   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7162   ins_encode %{
7163     int vector_len = 1;
7164     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7165   %}
7166   ins_pipe( pipe_slow );
7167 %}
7168 
7169 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7170   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7171   match(Set dst (SubVF src1 src2));
7172   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7173   ins_encode %{
7174     int vector_len = 2;
7175     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7176   %}
7177   ins_pipe( pipe_slow );
7178 %}
7179 
7180 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7181   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7182   match(Set dst (SubVF src (LoadVector mem)));
7183   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7184   ins_encode %{
7185     int vector_len = 2;
7186     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7187   %}
7188   ins_pipe( pipe_slow );
7189 %}
7190 
7191 // Doubles vector sub
7192 instruct vsub2D(vecX dst, vecX src) %{
7193   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7194   match(Set dst (SubVD dst src));
7195   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7196   ins_encode %{
7197     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7198   %}
7199   ins_pipe( pipe_slow );
7200 %}
7201 
7202 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7203   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7204   match(Set dst (SubVD src1 src2));
7205   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7206   ins_encode %{
7207     int vector_len = 0;
7208     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7209   %}
7210   ins_pipe( pipe_slow );
7211 %}
7212 
7213 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7214   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7215   match(Set dst (SubVD src (LoadVector mem)));
7216   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7217   ins_encode %{
7218     int vector_len = 0;
7219     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7220   %}
7221   ins_pipe( pipe_slow );
7222 %}
7223 
7224 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7225   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7226   match(Set dst (SubVD src1 src2));
7227   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7228   ins_encode %{
7229     int vector_len = 1;
7230     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7231   %}
7232   ins_pipe( pipe_slow );
7233 %}
7234 
7235 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7236   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7237   match(Set dst (SubVD src (LoadVector mem)));
7238   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7239   ins_encode %{
7240     int vector_len = 1;
7241     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7242   %}
7243   ins_pipe( pipe_slow );
7244 %}
7245 
7246 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7247   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7248   match(Set dst (SubVD src1 src2));
7249   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7250   ins_encode %{
7251     int vector_len = 2;
7252     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7253   %}
7254   ins_pipe( pipe_slow );
7255 %}
7256 
7257 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7258   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7259   match(Set dst (SubVD src (LoadVector mem)));
7260   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7261   ins_encode %{
7262     int vector_len = 2;
7263     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7264   %}
7265   ins_pipe( pipe_slow );
7266 %}
7267 
7268 // --------------------------------- MUL --------------------------------------
7269 
7270 // Byte vector mul
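// x86 has no packed byte multiply, so these rules widen: pmovsxbw sign-extends
// each byte to a word, pmullw produces the low 16 bits of every product, the
// vector_short_to_byte_mask constant (0x00ff in each 16-bit element) keeps only
// the low byte, and packuswb narrows the words back to bytes (no saturation
// occurs because every masked word is <= 0x00FF).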
7271 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp, rRegI scratch) %{
7272   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7273   match(Set dst (MulVB src1 src2));
7274   effect(TEMP dst, TEMP tmp, TEMP scratch);
7275   format %{"pmovsxbw  $tmp,$src1\n\t"
7276            "pmovsxbw  $dst,$src2\n\t"
7277            "pmullw    $tmp,$dst\n\t"
7278            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7279            "pand      $dst,$tmp\n\t"
7280            "packuswb  $dst,$dst\t! mul packed4B" %}
7281   ins_encode %{
7282     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7283     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7284     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7285     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7286     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7287     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7288   %}
7289   ins_pipe( pipe_slow );
7290 %}
7291 
7292 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp, rRegI scratch) %{
7293   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
7294   match(Set dst (MulVB src1 src2));
7295   effect(TEMP dst, TEMP tmp, TEMP scratch);
7296   format %{"pmovsxbw  $tmp,$src1\n\t"
7297            "pmovsxbw  $dst,$src2\n\t"
7298            "pmullw    $tmp,$dst\n\t"
7299            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7300            "pand      $dst,$tmp\n\t"
7301            "packuswb  $dst,$dst\t! mul packed8B" %}
7302   ins_encode %{
7303     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7304     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7305     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7306     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7307     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7308     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7309   %}
7310   ins_pipe( pipe_slow );
7311 %}
7312 
7313 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp2, rRegI scratch) %{
7314   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
7315   match(Set dst (MulVB src1 src2));
7316   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7317   format %{"pmovsxbw  $tmp1,$src1\n\t"
7318            "pmovsxbw  $tmp2,$src2\n\t"
7319            "pmullw    $tmp1,$tmp2\n\t"
7320            "pshufd    $tmp2,$src1,0xEE\n\t"
7321            "pshufd    $dst,$src2,0xEE\n\t"
7322            "pmovsxbw  $tmp2,$tmp2\n\t"
7323            "pmovsxbw  $dst,$dst\n\t"
7324            "pmullw    $tmp2,$dst\n\t"
7325            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7326            "pand      $tmp2,$dst\n\t"
7327            "pand      $dst,$tmp1\n\t"
7328            "packuswb  $dst,$tmp2\t! mul packed16B" %}
7329   ins_encode %{
7330     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
7331     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
7332     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
7333     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
7334     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
7335     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
7336     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
7337     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
7338     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7339     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
7340     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
7341     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
7342   %}
7343   ins_pipe( pipe_slow );
7344 %}
7345 
7346 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp, rRegI scratch) %{
7347   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7348   match(Set dst (MulVB src1 src2));
7349   effect(TEMP dst, TEMP tmp, TEMP scratch);
7350   format %{"vpmovsxbw  $tmp,$src1\n\t"
7351            "vpmovsxbw  $dst,$src2\n\t"
7352            "vpmullw    $tmp,$tmp,$dst\n\t"
7353            "vmovdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
7354            "vpand      $dst,$dst,$tmp\n\t"
7355            "vextracti128_high  $tmp,$dst\n\t"
7356            "vpackuswb  $dst,$dst,$tmp\t! mul packed16B" %}
7357   ins_encode %{
7358     int vector_len = 1;
7359     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
7360     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7361     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
7362     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7363     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
7364     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
7365     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
7366   %}
7367   ins_pipe( pipe_slow );
7368 %}
7369 
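// For the 256-bit and 512-bit byte multiplies below, vpackuswb packs within each
// 128-bit lane, so the packed result still needs a cross-lane qword shuffle:
// vpermq with immediate 0xD8 in the packed32B case, and a table permute loaded
// from vector_byte_perm_mask (via evmovdquq) in the packed64B case.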
7370 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, rRegI scratch) %{
7371   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
7372   match(Set dst (MulVB src1 src2));
7373   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7374   format %{"vextracti128_high  $tmp1,$src1\n\t"
7375            "vextracti128_high  $dst,$src2\n\t"
7376            "vpmovsxbw $tmp1,$tmp1\n\t"
7377            "vpmovsxbw $dst,$dst\n\t"
7378            "vpmullw $tmp1,$tmp1,$dst\n\t"
7379            "vpmovsxbw $tmp2,$src1\n\t"
7380            "vpmovsxbw $dst,$src2\n\t"
7381            "vpmullw $tmp2,$tmp2,$dst\n\t"
7382            "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t"
7383            "vpbroadcastd $dst, $dst\n\t"
7384            "vpand $tmp1,$tmp1,$dst\n\t"
7385            "vpand $dst,$dst,$tmp2\n\t"
7386            "vpackuswb $dst,$dst,$tmp1\n\t"
7387            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
7388   ins_encode %{
7389     int vector_len = 1;
7390     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7391     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
7392     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7393     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7394     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7395     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7396     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7397     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7398     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7399     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7400     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7401     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7402     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7403     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
7404   %}
7405   ins_pipe( pipe_slow );
7406 %}
7407 
7408 instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
7409   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
7410   match(Set dst (MulVB src1 src2));
7411   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
7412   format %{"vextracti64x4_high  $tmp1,$src1\n\t"
7413            "vextracti64x4_high  $dst,$src2\n\t"
7414            "vpmovsxbw $tmp1,$tmp1\n\t"
7415            "vpmovsxbw $dst,$dst\n\t"
7416            "vpmullw $tmp1,$tmp1,$dst\n\t"
7417            "vpmovsxbw $tmp2,$src1\n\t"
7418            "vpmovsxbw $dst,$src2\n\t"
7419            "vpmullw $tmp2,$tmp2,$dst\n\t"
7420            "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t"
7421            "vpbroadcastd $dst, $dst\n\t"
7422            "vpand $tmp1,$tmp1,$dst\n\t"
7423            "vpand $tmp2,$tmp2,$dst\n\t"
7424            "vpackuswb $dst,$tmp1,$tmp2\n\t"
7425            "evmovdquq  $tmp2,[0x0604020007050301]\n\t"
7426            "vpermq $dst,$tmp2,$dst\t! mul packed64B" %}
7428   ins_encode %{
7429     int vector_len = 2;
7430     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7431     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
7432     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7433     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7434     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7435     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7436     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7437     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7438     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7439     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7440     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7441     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7442     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7443     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
7444     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7446   %}
7447   ins_pipe( pipe_slow );
7448 %}
7449 
7450 // Shorts/Chars vector mul
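// pmullw/vpmullw keep the low 16 bits of each 16x16-bit product, which gives the
// truncating semantics needed for short/char vector multiplication.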
7451 instruct vmul2S(vecS dst, vecS src) %{
7452   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7453   match(Set dst (MulVS dst src));
7454   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7455   ins_encode %{
7456     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7457   %}
7458   ins_pipe( pipe_slow );
7459 %}
7460 
7461 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7462   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7463   match(Set dst (MulVS src1 src2));
7464   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7465   ins_encode %{
7466     int vector_len = 0;
7467     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7468   %}
7469   ins_pipe( pipe_slow );
7470 %}
7471 
7472 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7473   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7474   match(Set dst (MulVS src (LoadVector mem)));
7475   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7476   ins_encode %{
7477     int vector_len = 0;
7478     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7479   %}
7480   ins_pipe( pipe_slow );
7481 %}
7482 
7483 instruct vmul4S(vecD dst, vecD src) %{
7484   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7485   match(Set dst (MulVS dst src));
7486   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7487   ins_encode %{
7488     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7489   %}
7490   ins_pipe( pipe_slow );
7491 %}
7492 
7493 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7494   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7495   match(Set dst (MulVS src1 src2));
7496   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7497   ins_encode %{
7498     int vector_len = 0;
7499     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7500   %}
7501   ins_pipe( pipe_slow );
7502 %}
7503 
7504 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7505   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7506   match(Set dst (MulVS src (LoadVector mem)));
7507   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7508   ins_encode %{
7509     int vector_len = 0;
7510     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7511   %}
7512   ins_pipe( pipe_slow );
7513 %}
7514 
7515 instruct vmul8S(vecX dst, vecX src) %{
7516   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7517   match(Set dst (MulVS dst src));
7518   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7519   ins_encode %{
7520     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7521   %}
7522   ins_pipe( pipe_slow );
7523 %}
7524 
7525 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7526   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7527   match(Set dst (MulVS src1 src2));
7528   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7529   ins_encode %{
7530     int vector_len = 0;
7531     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7532   %}
7533   ins_pipe( pipe_slow );
7534 %}
7535 
7536 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7537   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7538   match(Set dst (MulVS src (LoadVector mem)));
7539   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7540   ins_encode %{
7541     int vector_len = 0;
7542     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7543   %}
7544   ins_pipe( pipe_slow );
7545 %}
7546 
7547 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7548   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7549   match(Set dst (MulVS src1 src2));
7550   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7551   ins_encode %{
7552     int vector_len = 1;
7553     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7554   %}
7555   ins_pipe( pipe_slow );
7556 %}
7557 
7558 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7559   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7560   match(Set dst (MulVS src (LoadVector mem)));
7561   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7562   ins_encode %{
7563     int vector_len = 1;
7564     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7565   %}
7566   ins_pipe( pipe_slow );
7567 %}
7568 
7569 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7570   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7571   match(Set dst (MulVS src1 src2));
7572   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7573   ins_encode %{
7574     int vector_len = 2;
7575     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7576   %}
7577   ins_pipe( pipe_slow );
7578 %}
7579 
7580 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7581   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7582   match(Set dst (MulVS src (LoadVector mem)));
7583   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7584   ins_encode %{
7585     int vector_len = 2;
7586     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7587   %}
7588   ins_pipe( pipe_slow );
7589 %}
7590 
7591 // Integers vector mul (sse4_1)
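// pmulld (packed dword multiply, low 32 bits) is an SSE4.1 instruction, hence the
// UseSSE > 3 predicate on the non-AVX rules below.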
7592 instruct vmul2I(vecD dst, vecD src) %{
7593   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7594   match(Set dst (MulVI dst src));
7595   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7596   ins_encode %{
7597     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7598   %}
7599   ins_pipe( pipe_slow );
7600 %}
7601 
7602 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7603   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7604   match(Set dst (MulVI src1 src2));
7605   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7606   ins_encode %{
7607     int vector_len = 0;
7608     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7609   %}
7610   ins_pipe( pipe_slow );
7611 %}
7612 
7613 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7614   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7615   match(Set dst (MulVI src (LoadVector mem)));
7616   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7617   ins_encode %{
7618     int vector_len = 0;
7619     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7620   %}
7621   ins_pipe( pipe_slow );
7622 %}
7623 
7624 instruct vmul4I(vecX dst, vecX src) %{
7625   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7626   match(Set dst (MulVI dst src));
7627   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7628   ins_encode %{
7629     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7630   %}
7631   ins_pipe( pipe_slow );
7632 %}
7633 
7634 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7635   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7636   match(Set dst (MulVI src1 src2));
7637   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7638   ins_encode %{
7639     int vector_len = 0;
7640     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7641   %}
7642   ins_pipe( pipe_slow );
7643 %}
7644 
7645 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7646   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7647   match(Set dst (MulVI src (LoadVector mem)));
7648   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7649   ins_encode %{
7650     int vector_len = 0;
7651     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7652   %}
7653   ins_pipe( pipe_slow );
7654 %}
7655 
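// Packed 64-bit multiply (vpmullq) exists only with AVX-512DQ (plus VL for the
// 128/256-bit forms), so every MulVL rule below is additionally predicated on
// VM_Version::supports_avx512dq(); no SSE/AVX2 form is provided here.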
7656 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7657   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7658   match(Set dst (MulVL src1 src2));
7659   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7660   ins_encode %{
7661     int vector_len = 0;
7662     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7663   %}
7664   ins_pipe( pipe_slow );
7665 %}
7666 
7667 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7668   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7669   match(Set dst (MulVL src (LoadVector mem)));
7670   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7671   ins_encode %{
7672     int vector_len = 0;
7673     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7674   %}
7675   ins_pipe( pipe_slow );
7676 %}
7677 
7678 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7679   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7680   match(Set dst (MulVL src1 src2));
7681   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7682   ins_encode %{
7683     int vector_len = 1;
7684     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7685   %}
7686   ins_pipe( pipe_slow );
7687 %}
7688 
7689 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7690   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7691   match(Set dst (MulVL src (LoadVector mem)));
7692   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7693   ins_encode %{
7694     int vector_len = 1;
7695     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7696   %}
7697   ins_pipe( pipe_slow );
7698 %}
7699 
7700 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7701   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7702   match(Set dst (MulVL src1 src2));
7703   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7704   ins_encode %{
7705     int vector_len = 2;
7706     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7707   %}
7708   ins_pipe( pipe_slow );
7709 %}
7710 
7711 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7712   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7713   match(Set dst (MulVL src (LoadVector mem)));
7714   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7715   ins_encode %{
7716     int vector_len = 2;
7717     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7718   %}
7719   ins_pipe( pipe_slow );
7720 %}
7721 
7722 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7723   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7724   match(Set dst (MulVI src1 src2));
7725   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7726   ins_encode %{
7727     int vector_len = 1;
7728     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7729   %}
7730   ins_pipe( pipe_slow );
7731 %}
7732 
7733 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7734   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7735   match(Set dst (MulVI src (LoadVector mem)));
7736   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7737   ins_encode %{
7738     int vector_len = 1;
7739     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7740   %}
7741   ins_pipe( pipe_slow );
7742 %}
7743 
7744 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7745   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7746   match(Set dst (MulVI src1 src2));
7747   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7748   ins_encode %{
7749     int vector_len = 2;
7750     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7751   %}
7752   ins_pipe( pipe_slow );
7753 %}
7754 
7755 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7756   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7757   match(Set dst (MulVI src (LoadVector mem)));
7758   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7759   ins_encode %{
7760     int vector_len = 2;
7761     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7762   %}
7763   ins_pipe( pipe_slow );
7764 %}
7765 
7766 // Floats vector mul
7767 instruct vmul2F(vecD dst, vecD src) %{
7768   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7769   match(Set dst (MulVF dst src));
7770   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7771   ins_encode %{
7772     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7773   %}
7774   ins_pipe( pipe_slow );
7775 %}
7776 
7777 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7778   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7779   match(Set dst (MulVF src1 src2));
7780   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7781   ins_encode %{
7782     int vector_len = 0;
7783     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7784   %}
7785   ins_pipe( pipe_slow );
7786 %}
7787 
7788 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7789   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7790   match(Set dst (MulVF src (LoadVector mem)));
7791   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7792   ins_encode %{
7793     int vector_len = 0;
7794     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7795   %}
7796   ins_pipe( pipe_slow );
7797 %}
7798 
7799 instruct vmul4F(vecX dst, vecX src) %{
7800   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7801   match(Set dst (MulVF dst src));
7802   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7803   ins_encode %{
7804     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7805   %}
7806   ins_pipe( pipe_slow );
7807 %}
7808 
7809 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7810   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7811   match(Set dst (MulVF src1 src2));
7812   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7813   ins_encode %{
7814     int vector_len = 0;
7815     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7816   %}
7817   ins_pipe( pipe_slow );
7818 %}
7819 
7820 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7821   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7822   match(Set dst (MulVF src (LoadVector mem)));
7823   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7824   ins_encode %{
7825     int vector_len = 0;
7826     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7827   %}
7828   ins_pipe( pipe_slow );
7829 %}
7830 
7831 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7832   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7833   match(Set dst (MulVF src1 src2));
7834   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7835   ins_encode %{
7836     int vector_len = 1;
7837     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7838   %}
7839   ins_pipe( pipe_slow );
7840 %}
7841 
7842 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7843   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7844   match(Set dst (MulVF src (LoadVector mem)));
7845   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7846   ins_encode %{
7847     int vector_len = 1;
7848     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7849   %}
7850   ins_pipe( pipe_slow );
7851 %}
7852 
7853 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7854   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7855   match(Set dst (MulVF src1 src2));
7856   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7857   ins_encode %{
7858     int vector_len = 2;
7859     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7860   %}
7861   ins_pipe( pipe_slow );
7862 %}
7863 
7864 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7865   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7866   match(Set dst (MulVF src (LoadVector mem)));
7867   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7868   ins_encode %{
7869     int vector_len = 2;
7870     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7871   %}
7872   ins_pipe( pipe_slow );
7873 %}
7874 
7875 // Doubles vector mul
7876 instruct vmul2D(vecX dst, vecX src) %{
7877   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7878   match(Set dst (MulVD dst src));
7879   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7880   ins_encode %{
7881     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7882   %}
7883   ins_pipe( pipe_slow );
7884 %}
7885 
7886 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7887   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7888   match(Set dst (MulVD src1 src2));
7889   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7890   ins_encode %{
7891     int vector_len = 0;
7892     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7893   %}
7894   ins_pipe( pipe_slow );
7895 %}
7896 
7897 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7898   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7899   match(Set dst (MulVD src (LoadVector mem)));
7900   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7901   ins_encode %{
7902     int vector_len = 0;
7903     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7904   %}
7905   ins_pipe( pipe_slow );
7906 %}
7907 
7908 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7909   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7910   match(Set dst (MulVD src1 src2));
7911   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7912   ins_encode %{
7913     int vector_len = 1;
7914     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7915   %}
7916   ins_pipe( pipe_slow );
7917 %}
7918 
7919 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7920   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7921   match(Set dst (MulVD src (LoadVector mem)));
7922   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7923   ins_encode %{
7924     int vector_len = 1;
7925     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7926   %}
7927   ins_pipe( pipe_slow );
7928 %}
7929 
7930 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7931   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7932   match(Set dst (MulVD src1 src2));
7933   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed8D" %}
7934   ins_encode %{
7935     int vector_len = 2;
7936     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7937   %}
7938   ins_pipe( pipe_slow );
7939 %}
7940 
7941 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7942   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7943   match(Set dst (MulVD src (LoadVector mem)));
7944   format %{ "vmulpd  $dst,$src,$mem\t! mul packed8D" %}
7945   ins_encode %{
7946     int vector_len = 2;
7947     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7948   %}
7949   ins_pipe( pipe_slow );
7950 %}
7951 
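// Vector conditional move (the two patterns below): in outline, the packed
// compare first produces a per-lane all-ones/all-zeros mask, and the variable
// blend then selects, per lane, the second source where the mask is set and the
// first source where it is clear, roughly dst[i] = mask[i] ? src2[i] : src1[i]
// (a sketch of the intent; the exact lane selection is that of blendvps/blendvpd).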
7952 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7953   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7954   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7955   effect(TEMP dst, USE src1, USE src2);
7956   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7957             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7958          %}
7959   ins_encode %{
7960     int vector_len = 1;
7961     int cond = (Assembler::Condition)($copnd$$cmpcode);
7962     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7963     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7964   %}
7965   ins_pipe( pipe_slow );
7966 %}
7967 
7968 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7969   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7970   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7971   effect(TEMP dst, USE src1, USE src2);
7972   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7973             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7974          %}
7975   ins_encode %{
7976     int vector_len = 1;
7977     int cond = (Assembler::Condition)($copnd$$cmpcode);
7978     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7979     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7980   %}
7981   ins_pipe( pipe_slow );
7982 %}
7983 
7984 // --------------------------------- DIV --------------------------------------
7985 
7986 // Floats vector div
7987 instruct vdiv2F(vecD dst, vecD src) %{
7988   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7989   match(Set dst (DivVF dst src));
7990   format %{ "divps   $dst,$src\t! div packed2F" %}
7991   ins_encode %{
7992     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7993   %}
7994   ins_pipe( pipe_slow );
7995 %}
7996 
7997 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7998   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7999   match(Set dst (DivVF src1 src2));
8000   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8001   ins_encode %{
8002     int vector_len = 0;
8003     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8004   %}
8005   ins_pipe( pipe_slow );
8006 %}
8007 
8008 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8009   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8010   match(Set dst (DivVF src (LoadVector mem)));
8011   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8012   ins_encode %{
8013     int vector_len = 0;
8014     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8015   %}
8016   ins_pipe( pipe_slow );
8017 %}
8018 
8019 instruct vdiv4F(vecX dst, vecX src) %{
8020   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8021   match(Set dst (DivVF dst src));
8022   format %{ "divps   $dst,$src\t! div packed4F" %}
8023   ins_encode %{
8024     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8025   %}
8026   ins_pipe( pipe_slow );
8027 %}
8028 
8029 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8030   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8031   match(Set dst (DivVF src1 src2));
8032   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8033   ins_encode %{
8034     int vector_len = 0;
8035     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8036   %}
8037   ins_pipe( pipe_slow );
8038 %}
8039 
8040 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8041   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8042   match(Set dst (DivVF src (LoadVector mem)));
8043   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8044   ins_encode %{
8045     int vector_len = 0;
8046     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8047   %}
8048   ins_pipe( pipe_slow );
8049 %}
8050 
8051 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8052   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8053   match(Set dst (DivVF src1 src2));
8054   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8055   ins_encode %{
8056     int vector_len = 1;
8057     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8058   %}
8059   ins_pipe( pipe_slow );
8060 %}
8061 
8062 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8063   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8064   match(Set dst (DivVF src (LoadVector mem)));
8065   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8066   ins_encode %{
8067     int vector_len = 1;
8068     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8069   %}
8070   ins_pipe( pipe_slow );
8071 %}
8072 
8073 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8074   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8075   match(Set dst (DivVF src1 src2));
8076   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8077   ins_encode %{
8078     int vector_len = 2;
8079     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8080   %}
8081   ins_pipe( pipe_slow );
8082 %}
8083 
8084 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8085   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8086   match(Set dst (DivVF src (LoadVector mem)));
8087   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8088   ins_encode %{
8089     int vector_len = 2;
8090     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8091   %}
8092   ins_pipe( pipe_slow );
8093 %}
8094 
8095 // Doubles vector div
8096 instruct vdiv2D(vecX dst, vecX src) %{
8097   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8098   match(Set dst (DivVD dst src));
8099   format %{ "divpd   $dst,$src\t! div packed2D" %}
8100   ins_encode %{
8101     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8102   %}
8103   ins_pipe( pipe_slow );
8104 %}
8105 
8106 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8107   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8108   match(Set dst (DivVD src1 src2));
8109   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8110   ins_encode %{
8111     int vector_len = 0;
8112     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8113   %}
8114   ins_pipe( pipe_slow );
8115 %}
8116 
8117 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8118   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8119   match(Set dst (DivVD src (LoadVector mem)));
8120   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8121   ins_encode %{
8122     int vector_len = 0;
8123     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8124   %}
8125   ins_pipe( pipe_slow );
8126 %}
8127 
8128 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8129   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8130   match(Set dst (DivVD src1 src2));
8131   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8132   ins_encode %{
8133     int vector_len = 1;
8134     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8135   %}
8136   ins_pipe( pipe_slow );
8137 %}
8138 
8139 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8140   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8141   match(Set dst (DivVD src (LoadVector mem)));
8142   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8143   ins_encode %{
8144     int vector_len = 1;
8145     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8146   %}
8147   ins_pipe( pipe_slow );
8148 %}
8149 
8150 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8151   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8152   match(Set dst (DivVD src1 src2));
8153   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8154   ins_encode %{
8155     int vector_len = 2;
8156     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8157   %}
8158   ins_pipe( pipe_slow );
8159 %}
8160 
8161 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8162   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8163   match(Set dst (DivVD src (LoadVector mem)));
8164   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8165   ins_encode %{
8166     int vector_len = 2;
8167     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8168   %}
8169   ins_pipe( pipe_slow );
8170 %}
8171 
8172 // --------------------------------- Sqrt --------------------------------------
8173 
8174 // Floating point vector sqrt
8175 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8176   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8177   match(Set dst (SqrtVD src));
8178   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8179   ins_encode %{
8180     int vector_len = 0;
8181     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8182   %}
8183   ins_pipe( pipe_slow );
8184 %}
8185 
8186 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8187   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8188   match(Set dst (SqrtVD (LoadVector mem)));
8189   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8190   ins_encode %{
8191     int vector_len = 0;
8192     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8193   %}
8194   ins_pipe( pipe_slow );
8195 %}
8196 
8197 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8198   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8199   match(Set dst (SqrtVD src));
8200   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8201   ins_encode %{
8202     int vector_len = 1;
8203     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8204   %}
8205   ins_pipe( pipe_slow );
8206 %}
8207 
8208 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8209   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8210   match(Set dst (SqrtVD (LoadVector mem)));
8211   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8212   ins_encode %{
8213     int vector_len = 1;
8214     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8215   %}
8216   ins_pipe( pipe_slow );
8217 %}
8218 
8219 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8220   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8221   match(Set dst (SqrtVD src));
8222   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8223   ins_encode %{
8224     int vector_len = 2;
8225     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8226   %}
8227   ins_pipe( pipe_slow );
8228 %}
8229 
8230 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8231   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8232   match(Set dst (SqrtVD (LoadVector mem)));
8233   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8234   ins_encode %{
8235     int vector_len = 2;
8236     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8237   %}
8238   ins_pipe( pipe_slow );
8239 %}
8240 
8241 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8242   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8243   match(Set dst (SqrtVF src));
8244   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8245   ins_encode %{
8246     int vector_len = 0;
8247     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8248   %}
8249   ins_pipe( pipe_slow );
8250 %}
8251 
8252 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8253   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8254   match(Set dst (SqrtVF (LoadVector mem)));
8255   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8256   ins_encode %{
8257     int vector_len = 0;
8258     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8259   %}
8260   ins_pipe( pipe_slow );
8261 %}
8262 
8263 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8264   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8265   match(Set dst (SqrtVF src));
8266   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8267   ins_encode %{
8268     int vector_len = 0;
8269     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8270   %}
8271   ins_pipe( pipe_slow );
8272 %}
8273 
8274 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8275   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8276   match(Set dst (SqrtVF (LoadVector mem)));
8277   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8278   ins_encode %{
8279     int vector_len = 0;
8280     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8281   %}
8282   ins_pipe( pipe_slow );
8283 %}
8284 
8285 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8286   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8287   match(Set dst (SqrtVF src));
8288   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8289   ins_encode %{
8290     int vector_len = 1;
8291     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8292   %}
8293   ins_pipe( pipe_slow );
8294 %}
8295 
8296 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8297   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8298   match(Set dst (SqrtVF (LoadVector mem)));
8299   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8300   ins_encode %{
8301     int vector_len = 1;
8302     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8303   %}
8304   ins_pipe( pipe_slow );
8305 %}
8306 
8307 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8308   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8309   match(Set dst (SqrtVF src));
8310   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8311   ins_encode %{
8312     int vector_len = 2;
8313     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8314   %}
8315   ins_pipe( pipe_slow );
8316 %}
8317 
8318 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8319   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8320   match(Set dst (SqrtVF (LoadVector mem)));
8321   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8322   ins_encode %{
8323     int vector_len = 2;
8324     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8325   %}
8326   ins_pipe( pipe_slow );
8327 %}
8328 
8329 // ------------------------------ Shift ---------------------------------------
8330 
8331 // Left and right shift count vectors are the same on x86: the shift-by-xmm
8332 // forms use only the low 64 bits of the xmm register as the count.
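// For example (a minimal Java sketch of the expected use): in a loop such as
//   for (int i = 0; i < a.length; i++) { a[i] = a[i] << s; }
// the scalar count 's' is moved into an xmm register once by the pattern below,
// and the same value is then fed unchanged to the word/dword/qword shifts.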
8333 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8334   match(Set dst (LShiftCntV cnt));
8335   match(Set dst (RShiftCntV cnt));
8336   format %{ "movdl    $dst,$cnt\t! load shift count" %}
8337   ins_encode %{
8338     __ movdl($dst$$XMMRegister, $cnt$$Register);
8339   %}
8340   ins_pipe( pipe_slow );
8341 %}
8342 
8343 instruct vshiftcntimm(vecS dst, immI8 cnt, rRegI tmp) %{
8344   match(Set dst cnt);
8345   effect(TEMP tmp);
8346   format %{ "movl    $tmp,$cnt\n\t"
8347             "movdl   $dst,$tmp\t! load shift count" %}
8348   ins_encode %{
8349     __ movl($tmp$$Register, $cnt$$constant);
8350     __ movdl($dst$$XMMRegister, $tmp$$Register);
8351   %}
8352   ins_pipe( pipe_slow );
8353 %}
8354 
8355 // Byte vector shift
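// SSE/AVX provide packed shifts for words, dwords and qwords but not for bytes,
// so the patterns below widen the bytes to words (sign- or zero-extending to
// match the shift kind), shift as words, mask each lane back to its low byte
// with 0x00ff, and re-pack the results into bytes.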
8356 instruct vshift4B(vecS dst, vecS src, vecS shift, vecS tmp, rRegI scratch) %{
8357   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
8358   match(Set dst (LShiftVB src shift));
8359   match(Set dst (RShiftVB src shift));
8360   match(Set dst (URShiftVB src shift));
8361   effect(TEMP dst, TEMP tmp, TEMP scratch);
8362   format %{"vextendbw $tmp,$src\n\t"
8363            "vshiftw   $tmp,$shift\n\t"
8364            "movdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
8365            "pand      $dst,$tmp\n\t"
8366            "packuswb  $dst,$dst\n\t ! packed4B shift" %}
8367   ins_encode %{
8368     int opcode = this->as_Mach()->ideal_Opcode();
8369 
8370     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
8371     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
8372     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); 
8373     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
8374     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
8375   %}
8376   ins_pipe( pipe_slow );
8377 %}
8378 
8379 instruct vshift8B(vecD dst, vecD src, vecS shift, vecD tmp, rRegI scratch) %{
8380   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
8381   match(Set dst (LShiftVB src shift));
8382   match(Set dst (RShiftVB src shift));
8383   match(Set dst (URShiftVB src shift));
8384   effect(TEMP dst, TEMP tmp, TEMP scratch);
8385   format %{"vextendbw $tmp,$src\n\t"
8386            "vshiftw   $tmp,$shift\n\t"
8387            "movdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
8388            "pand      $dst,$tmp\n\t"
8389            "packuswb  $dst,$dst\n\t ! packed8B shift" %}
8390   ins_encode %{
8391     int opcode = this->as_Mach()->ideal_Opcode();
8392 
8393     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
8394     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
8395     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); 
8396     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
8397     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
8398   %}
8399   ins_pipe( pipe_slow );
8400 %}
8401 
8402 instruct vshift16B(vecX dst, vecX src, vecS shift, vecX tmp1, vecX tmp2, rRegI scratch) %{
8403   predicate(UseSSE > 3  && UseAVX <= 1 && n->as_Vector()->length() == 16);
8404   match(Set dst (LShiftVB src shift));
8405   match(Set dst (RShiftVB src shift));
8406   match(Set dst (URShiftVB src shift));
8407   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
8408   format %{"vextendbw $tmp1,$src\n\t"
8409            "vshiftw   $tmp1,$shift\n\t"
8410            "pshufd    $tmp2,$src\n\t"
8411            "vextendbw $tmp2,$tmp2\n\t"
8412            "vshiftw   $tmp2,$shift\n\t"
8413            "movdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
8414            "pand      $tmp2,$dst\n\t"
8415            "pand      $dst,$tmp1\n\t"
8416            "packuswb  $dst,$tmp2\n\t! packed16B shift" %}
8417   ins_encode %{
8418     int opcode = this->as_Mach()->ideal_Opcode();
8419 
8420     __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister);
8421     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
8422     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
8423     __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
8424     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
8425     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
8426     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
8427     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
8428     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
8429   %}
8430   ins_pipe( pipe_slow );
8431 %}
8432 
8433 instruct vshift16B_avx(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8434   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8435   match(Set dst (LShiftVB src shift));
8436   match(Set dst (RShiftVB src shift));
8437   match(Set dst (URShiftVB src shift));
8438   effect(TEMP dst, TEMP tmp, TEMP scratch);
8439   format %{"vextendbw  $tmp,$src\n\t"
8440            "vshiftw    $tmp,$tmp,$shift\n\t"
8441            "vpand      $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
8442            "vextracti128_high  $dst,$tmp\n\t"
8443            "vpackuswb  $dst,$tmp,$dst\n\t! packed16B shift" %}
8444   ins_encode %{
8445     int opcode = this->as_Mach()->ideal_Opcode();
8446 
8447     int vector_len = 1;
8448     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
8449     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8450     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8451     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
8452     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
8453   %}
8454   ins_pipe( pipe_slow );
8455 %}
8456 
8457 instruct vshift32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8458   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
8459   match(Set dst (LShiftVB src shift));
8460   match(Set dst (RShiftVB src shift));
8461   match(Set dst (URShiftVB src shift));
8462   effect(TEMP dst, TEMP tmp, TEMP scratch);
8463   format %{"vextracti128_high  $tmp,$src\n\t"
8464            "vextendbw  $tmp,$tmp\n\t"
8465            "vextendbw  $dst,$src\n\t"
8466            "vshiftw    $tmp,$tmp,$shift\n\t"
8467            "vshiftw    $dst,$dst,$shift\n\t"
8468            "vpand      $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
8469            "vpand      $dst,$dst,[0x00ff00ff00ff00ff]\n\t"
8470            "vpackuswb  $dst,$dst,$tmp\n\t"
8471            "vpermq     $dst,$dst,0xD8\n\t! packed32B shift" %}
8472   ins_encode %{
8473     int opcode = this->as_Mach()->ideal_Opcode();
8474 
8475     int vector_len = 1;
8476     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
8477     __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8478     __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
8479     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8480     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len);
8481     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8482     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8483     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8484     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
8485   %}
8486   ins_pipe( pipe_slow );
8487 %}
8488 
8489 instruct vshift64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
8490   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
8491   match(Set dst (LShiftVB src shift));
8492   match(Set dst (RShiftVB src shift));
8493   match(Set dst (URShiftVB src shift));
8494   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
8495   format %{"vextracti64x4  $tmp1,$src\n\t"
8496            "vextendbw      $tmp1,$tmp1\n\t"
8497            "vextendbw      $tmp2,$src\n\t"
8498            "vshiftw        $tmp1,$tmp1,$shift\n\t"
8499            "vshiftw        $tmp2,$tmp2,$shift\n\t"
8500            "vmovdqu        $dst,[0x00ff00ff00ff00ff]\n\t"
8501            "vpbroadcastd   $dst,$dst\n\t"
8502            "vpand          $tmp1,$tmp1,$dst\n\t"
8503            "vpand          $tmp2,$tmp2,$dst\n\t"
8504            "vpackuswb      $dst,$tmp1,$tmp2\n\t"
8505            "evmovdquq      $tmp2, [0x0604020007050301]\n\t"
8506            "vpermq         $dst,$tmp2,$dst\n\t! packed64B shift" %}
8507   ins_encode %{
8508     int opcode = this->as_Mach()->ideal_Opcode();
8509 
8510     int vector_len = 2;
8511     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
8512     __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8513     __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
8514     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len);
8515     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
8516     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
8517     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
8518     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
8519     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
8520     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8521     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
8522     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
8523   %}
8524   ins_pipe( pipe_slow );
8525 %}
8526 
8527 // A shorts vector logical right shift produces an incorrect Java result for
8528 // negative data, because Java promotes the short value to int with sign
8529 // extension before shifting. Char vectors are fine, since chars are
8530 // unsigned values.
8531 // Shorts/Chars vector shift
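// For instance (a minimal Java sketch of the scalar semantics being matched):
//   short s = -2;                // 0xfffe
//   short r = (short)(s >>> 1);  // promoted to 0xfffffffe, shifted to 0x7fffffff, narrowed to (short)0xffff = -1
// while a 16-bit logical shift of 0xfffe by 1 yields 0x7fff = 32767, which is
// why the logical right shift form is only usable for char data, as noted above.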
8532 instruct vshift2S(vecS dst, vecS src, vecS shift) %{
8533   predicate(n->as_Vector()->length() == 2);
8534   match(Set dst (LShiftVS src shift));
8535   match(Set dst (RShiftVS src shift));
8536   match(Set dst (URShiftVS src shift));
8537   format %{ "vshiftw  $dst,$src,$shift\t! shift packed2S" %}
8538   ins_encode %{
8539     int opcode = this->as_Mach()->ideal_Opcode();
8540     if (UseAVX == 0) { 
8541       if ($dst$$XMMRegister != $src$$XMMRegister)
8542          __ movflt($dst$$XMMRegister, $src$$XMMRegister);
8543       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8544     } else {
8545       int vector_len = 0;
8546       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8547     }
8548   %}
8549   ins_pipe( pipe_slow );
8550 %}
8551 
8552 instruct vshift4S(vecD dst, vecD src, vecS shift) %{
8553   predicate(n->as_Vector()->length() == 4);
8554   match(Set dst (LShiftVS src shift));
8555   match(Set dst (RShiftVS src shift));
8556   match(Set dst (URShiftVS src shift));
8557   format %{ "vshiftw  $dst,$src,$shift\t! shift packed4S" %}
8558   ins_encode %{
8559     int opcode = this->as_Mach()->ideal_Opcode();
8560     if (UseAVX == 0) { 
8561       if ($dst$$XMMRegister != $src$$XMMRegister)
8562          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8563       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8565     } else {
8566       int vector_len = 0;
8567       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8568     }
8569   %}
8570   ins_pipe( pipe_slow );
8571 %}
8572 
8573 instruct vshift8S(vecX dst, vecX src, vecS shift) %{
8574   predicate(n->as_Vector()->length() == 8);
8575   match(Set dst (LShiftVS src shift));
8576   match(Set dst (RShiftVS src shift));
8577   match(Set dst (URShiftVS src shift));
8578   format %{ "vshiftw  $dst,$src,$shift\t! shift packed8S" %}
8579   ins_encode %{
8580     int opcode = this->as_Mach()->ideal_Opcode();
8581     if (UseAVX == 0) { 
8582       if ($dst$$XMMRegister != $src$$XMMRegister)
8583          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8584       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8585     } else {
8586       int vector_len = 0;
8587       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8588     }
8589   %}
8590   ins_pipe( pipe_slow );
8591 %}
8592 
8593 instruct vshift16S(vecY dst, vecY src, vecS shift) %{
8594   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8595   match(Set dst (LShiftVS src shift));
8596   match(Set dst (RShiftVS src shift));
8597   match(Set dst (URShiftVS src shift));
8598   format %{ "vshiftw  $dst,$src,$shift\t! shift packed16S" %}
8599   ins_encode %{
8600     int vector_len = 1;
8601     int opcode = this->as_Mach()->ideal_Opcode();
8602     __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8603   %}
8604   ins_pipe( pipe_slow );
8605 %}
8606 
8607 instruct vshift32S(vecZ dst, vecZ src, vecS shift) %{
8608   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8609   match(Set dst (LShiftVS src shift));
8610   match(Set dst (RShiftVS src shift));
8611   match(Set dst (URShiftVS src shift));
8612   format %{ "vshiftw  $dst,$src,$shift\t! shift packed32S" %}
8613   ins_encode %{
8614     int vector_len = 2;
8615     int opcode = this->as_Mach()->ideal_Opcode();
8616     __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8617   %}
8618   ins_pipe( pipe_slow );
8619 %}
8620 
8621 // Integers vector shift
8622 instruct vshift2I(vecD dst, vecD src, vecS shift) %{
8623   predicate(n->as_Vector()->length() == 2);
8624   match(Set dst (LShiftVI src shift));
8625   match(Set dst (RShiftVI src shift));
8626   match(Set dst (URShiftVI src shift));
8627   format %{ "vshiftd  $dst,$src,$shift\t! shift packed2I" %}
8628   ins_encode %{
8629     int opcode = this->as_Mach()->ideal_Opcode();
8630     if (UseAVX == 0) { 
8631       if ($dst$$XMMRegister != $src$$XMMRegister)
8632          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8633       __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8634     } else {
8635       int vector_len = 0;
8636       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8637     }
8638   %}
8639   ins_pipe( pipe_slow );
8640 %}
8641 
8642 instruct vshift4I(vecX dst, vecX src, vecS shift) %{
8643   predicate(n->as_Vector()->length() == 4);
8644   match(Set dst (LShiftVI src shift));
8645   match(Set dst (RShiftVI src shift));
8646   match(Set dst (URShiftVI src shift));
8647   format %{ "vshiftd  $dst,$src,$shift\t! shift packed4I" %}
8648   ins_encode %{
8649     int opcode = this->as_Mach()->ideal_Opcode();
8650     if (UseAVX == 0) { 
8651       if ($dst$$XMMRegister != $src$$XMMRegister)
8652          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8653       __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8654     } else {
8655       int vector_len = 0;
8656       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8657     }
8658   %}
8659   ins_pipe( pipe_slow );
8660 %}
8661 
8662 instruct vshift8I(vecY dst, vecY src, vecS shift) %{
8663   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8664   match(Set dst (LShiftVI src shift));
8665   match(Set dst (RShiftVI src shift));
8666   match(Set dst (URShiftVI src shift));
8667   format %{ "vshiftd  $dst,$src,$shift\t! shift packed8I" %}
8668   ins_encode %{
8669     int vector_len = 1;
8670     int opcode = this->as_Mach()->ideal_Opcode();
8671     __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8672   %}
8673   ins_pipe( pipe_slow );
8674 %}
8675 
8676 instruct vshift16I(vecZ dst, vecZ src, vecS shift) %{
8677   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8678   match(Set dst (LShiftVI src shift));
8679   match(Set dst (RShiftVI src shift));
8680   match(Set dst (URShiftVI src shift));
8681   format %{ "vshiftd  $dst,$src,$shift\t! shift packed16I" %}
8682   ins_encode %{
8683     int vector_len = 2;
8684     int opcode = this->as_Mach()->ideal_Opcode();
8685     __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8686   %}
8687   ins_pipe( pipe_slow );
8688 %}
8689 
8690 // Longs vector shift
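// Note that the 128-bit and 256-bit patterns below do not match RShiftVL:
// there is no packed 64-bit arithmetic right shift before AVX-512, so that case
// is handled separately in the ArithmeticRightShift section further down.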
8691 instruct vshift2L(vecX dst, vecX src, vecS shift) %{
8692   predicate(n->as_Vector()->length() == 2);
8693   match(Set dst (LShiftVL src shift));
8694   match(Set dst (URShiftVL src shift));
8695   format %{ "vshiftq  $dst,$src,$shift\t! shift packed2L" %}
8696   ins_encode %{
8697     int opcode = this->as_Mach()->ideal_Opcode();
8698     if (UseAVX == 0) { 
8699       if ($dst$$XMMRegister != $src$$XMMRegister)
8700          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8701       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8702     } else {
8703       int vector_len = 0;
8704       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8705     }
8706   %}
8707   ins_pipe( pipe_slow );
8708 %}
8709 
8710 instruct vshift4L(vecY dst, vecY src, vecS shift) %{
8711   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8712   match(Set dst (LShiftVL src shift));
8713   match(Set dst (URShiftVL src shift));
8714   format %{ "vshiftq  $dst,$src,$shift\t! shift packed4L" %}
8715   ins_encode %{
8716     int vector_len = 1;
8717     int opcode = this->as_Mach()->ideal_Opcode();
8718     __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8719   %}
8720   ins_pipe( pipe_slow );
8721 %}
8722 
8723 instruct vshift8L(vecZ dst, vecZ src, vecS shift) %{
8724   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8725   match(Set dst (LShiftVL src shift));
8726   match(Set dst (RShiftVL src shift));
8727   match(Set dst (URShiftVL src shift));
8728   format %{ "vshiftq  $dst,$src,$shift\t! shift packed8L" %}
8729   ins_encode %{
8730     int vector_len = 2;
8731     int opcode = this->as_Mach()->ideal_Opcode();
8732     __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8733   %}
8734   ins_pipe( pipe_slow );
8735 %}
8736 
8737 // -------------------ArithmeticRightShift -----------------------------------
8738 // Long vector arithmetic right shift
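// SSE2/AVX2 have no packed 64-bit arithmetic right shift, so the non-EVEX
// patterns below synthesize it from a logical shift plus a sign fix-up,
// roughly (Java sketch, for the 0..63 shift counts Java produces):
//   long m = 0x8000000000000000L >>> n;   // the sign bit, shifted into place
//   long r = ((x >>> n) ^ m) - m;         // equals x >> n
// which matches the psrlq/pxor/psubq (and vpsrlq/vpxor/vpsubq) sequences below.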
8739 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8740   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
8741   match(Set dst (RShiftVL src shift));
8742   effect(TEMP dst, TEMP tmp, TEMP scratch);
8743   format %{ "movdqu  $dst,$src\n\t"
8744             "psrlq   $dst,$shift\n\t"
8745             "movdqu  $tmp,[0x8000000000000000]\n\t"
8746             "psrlq   $tmp,$shift\n\t"
8747             "pxor    $dst,$tmp\n\t"
8748             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
8749   ins_encode %{
8750     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8751     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8752     __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8753     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
8754     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
8755     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
8756   %}
8757   ins_pipe( pipe_slow );
8758 %}
8759 
8760 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
8761   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
8762   match(Set dst (RShiftVL src shift));
8763   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
8764   ins_encode %{
8765     int vector_len = 0;
8766     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8767   %}
8768   ins_pipe( pipe_slow );
8769 %}
8770 
8771 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8772   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8773   match(Set dst (RShiftVL src shift));
8774   effect(TEMP dst, TEMP tmp, TEMP scratch);
8775   format %{ "vpsrlq   $dst,$src,$shift\n\t"
8776             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
8777             "vpsrlq   $tmp,$tmp,$shift\n\t"
8778             "vpxor    $dst,$dst,$tmp\n\t"
8779             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
8780   ins_encode %{
8781     int vector_len = 1;
8782     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8783     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8784     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8785     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8786     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8787   %}
8788   ins_pipe( pipe_slow );
8789 %}
8790 
8791 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
8792   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
8793   match(Set dst (RShiftVL src shift));
8794   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
8795   ins_encode %{
8796     int vector_len = 1;
8797     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8798   %}
8799   ins_pipe( pipe_slow );
8800 %}
8801 
8802 // --------------------------------- AND --------------------------------------
8803 
8804 instruct vand4B(vecS dst, vecS src) %{
8805   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
8806   match(Set dst (AndV dst src));
8807   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
8808   ins_encode %{
8809     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8810   %}
8811   ins_pipe( pipe_slow );
8812 %}
8813 
8814 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
8815   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8816   match(Set dst (AndV src1 src2));
8817   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
8818   ins_encode %{
8819     int vector_len = 0;
8820     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8821   %}
8822   ins_pipe( pipe_slow );
8823 %}
8824 
8825 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
8826   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8827   match(Set dst (AndV src (LoadVector mem)));
8828   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
8829   ins_encode %{
8830     int vector_len = 0;
8831     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8832   %}
8833   ins_pipe( pipe_slow );
8834 %}
8835 
8836 instruct vand8B(vecD dst, vecD src) %{
8837   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
8838   match(Set dst (AndV dst src));
8839   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
8840   ins_encode %{
8841     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8842   %}
8843   ins_pipe( pipe_slow );
8844 %}
8845 
8846 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
8847   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8848   match(Set dst (AndV src1 src2));
8849   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
8850   ins_encode %{
8851     int vector_len = 0;
8852     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8853   %}
8854   ins_pipe( pipe_slow );
8855 %}
8856 
8857 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
8858   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8859   match(Set dst (AndV src (LoadVector mem)));
8860   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
8861   ins_encode %{
8862     int vector_len = 0;
8863     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8864   %}
8865   ins_pipe( pipe_slow );
8866 %}
8867 
8868 instruct vand16B(vecX dst, vecX src) %{
8869   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
8870   match(Set dst (AndV dst src));
8871   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
8872   ins_encode %{
8873     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8874   %}
8875   ins_pipe( pipe_slow );
8876 %}
8877 
8878 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
8879   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8880   match(Set dst (AndV src1 src2));
8881   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
8882   ins_encode %{
8883     int vector_len = 0;
8884     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8885   %}
8886   ins_pipe( pipe_slow );
8887 %}
8888 
8889 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
8890   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8891   match(Set dst (AndV src (LoadVector mem)));
8892   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
8893   ins_encode %{
8894     int vector_len = 0;
8895     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8896   %}
8897   ins_pipe( pipe_slow );
8898 %}
8899 
8900 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
8901   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8902   match(Set dst (AndV src1 src2));
8903   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
8904   ins_encode %{
8905     int vector_len = 1;
8906     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8907   %}
8908   ins_pipe( pipe_slow );
8909 %}
8910 
8911 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
8912   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8913   match(Set dst (AndV src (LoadVector mem)));
8914   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
8915   ins_encode %{
8916     int vector_len = 1;
8917     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8918   %}
8919   ins_pipe( pipe_slow );
8920 %}
8921 
8922 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
8923   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
8924   match(Set dst (AndV src1 src2));
8925   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
8926   ins_encode %{
8927     int vector_len = 2;
8928     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8929   %}
8930   ins_pipe( pipe_slow );
8931 %}
8932 
8933 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
8934   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
8935   match(Set dst (AndV src (LoadVector mem)));
8936   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
8937   ins_encode %{
8938     int vector_len = 2;
8939     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8940   %}
8941   ins_pipe( pipe_slow );
8942 %}
8943 
8944 // --------------------------------- OR ---------------------------------------
8945 
8946 instruct vor4B(vecS dst, vecS src) %{
8947   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
8948   match(Set dst (OrV dst src));
8949   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
8950   ins_encode %{
8951     __ por($dst$$XMMRegister, $src$$XMMRegister);
8952   %}
8953   ins_pipe( pipe_slow );
8954 %}
8955 
8956 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
8957   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8958   match(Set dst (OrV src1 src2));
8959   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
8960   ins_encode %{
8961     int vector_len = 0;
8962     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8963   %}
8964   ins_pipe( pipe_slow );
8965 %}
8966 
8967 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
8968   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8969   match(Set dst (OrV src (LoadVector mem)));
8970   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
8971   ins_encode %{
8972     int vector_len = 0;
8973     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8974   %}
8975   ins_pipe( pipe_slow );
8976 %}
8977 
8978 instruct vor8B(vecD dst, vecD src) %{
8979   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
8980   match(Set dst (OrV dst src));
8981   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
8982   ins_encode %{
8983     __ por($dst$$XMMRegister, $src$$XMMRegister);
8984   %}
8985   ins_pipe( pipe_slow );
8986 %}
8987 
8988 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
8989   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8990   match(Set dst (OrV src1 src2));
8991   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
8992   ins_encode %{
8993     int vector_len = 0;
8994     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8995   %}
8996   ins_pipe( pipe_slow );
8997 %}
8998 
8999 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9000   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9001   match(Set dst (OrV src (LoadVector mem)));
9002   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9003   ins_encode %{
9004     int vector_len = 0;
9005     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9006   %}
9007   ins_pipe( pipe_slow );
9008 %}
9009 
9010 instruct vor16B(vecX dst, vecX src) %{
9011   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9012   match(Set dst (OrV dst src));
9013   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9014   ins_encode %{
9015     __ por($dst$$XMMRegister, $src$$XMMRegister);
9016   %}
9017   ins_pipe( pipe_slow );
9018 %}
9019 
9020 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9021   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9022   match(Set dst (OrV src1 src2));
9023   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9024   ins_encode %{
9025     int vector_len = 0;
9026     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9027   %}
9028   ins_pipe( pipe_slow );
9029 %}
9030 
9031 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9032   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9033   match(Set dst (OrV src (LoadVector mem)));
9034   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9035   ins_encode %{
9036     int vector_len = 0;
9037     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9038   %}
9039   ins_pipe( pipe_slow );
9040 %}
9041 
9042 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9043   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9044   match(Set dst (OrV src1 src2));
9045   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9046   ins_encode %{
9047     int vector_len = 1;
9048     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9049   %}
9050   ins_pipe( pipe_slow );
9051 %}
9052 
9053 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9054   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9055   match(Set dst (OrV src (LoadVector mem)));
9056   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9057   ins_encode %{
9058     int vector_len = 1;
9059     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9060   %}
9061   ins_pipe( pipe_slow );
9062 %}
9063 
9064 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9065   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9066   match(Set dst (OrV src1 src2));
9067   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9068   ins_encode %{
9069     int vector_len = 2;
9070     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9071   %}
9072   ins_pipe( pipe_slow );
9073 %}
9074 
9075 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9076   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9077   match(Set dst (OrV src (LoadVector mem)));
9078   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9079   ins_encode %{
9080     int vector_len = 2;
9081     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9082   %}
9083   ins_pipe( pipe_slow );
9084 %}
9085 
9086 // --------------------------------- XOR --------------------------------------
9087 
9088 instruct vxor4B(vecS dst, vecS src) %{
9089   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9090   match(Set dst (XorV dst src));
9091   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9092   ins_encode %{
9093     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9094   %}
9095   ins_pipe( pipe_slow );
9096 %}
9097 
9098 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9099   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9100   match(Set dst (XorV src1 src2));
9101   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9102   ins_encode %{
9103     int vector_len = 0;
9104     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9105   %}
9106   ins_pipe( pipe_slow );
9107 %}
9108 
9109 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9110   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9111   match(Set dst (XorV src (LoadVector mem)));
9112   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9113   ins_encode %{
9114     int vector_len = 0;
9115     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9116   %}
9117   ins_pipe( pipe_slow );
9118 %}
9119 
9120 instruct vxor8B(vecD dst, vecD src) %{
9121   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9122   match(Set dst (XorV dst src));
9123   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9124   ins_encode %{
9125     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9126   %}
9127   ins_pipe( pipe_slow );
9128 %}
9129 
9130 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9131   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9132   match(Set dst (XorV src1 src2));
9133   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9134   ins_encode %{
9135     int vector_len = 0;
9136     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9137   %}
9138   ins_pipe( pipe_slow );
9139 %}
9140 
9141 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9142   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9143   match(Set dst (XorV src (LoadVector mem)));
9144   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9145   ins_encode %{
9146     int vector_len = 0;
9147     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9148   %}
9149   ins_pipe( pipe_slow );
9150 %}
9151 
9152 instruct vxor16B(vecX dst, vecX src) %{
9153   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9154   match(Set dst (XorV dst src));
9155   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9156   ins_encode %{
9157     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9158   %}
9159   ins_pipe( pipe_slow );
9160 %}
9161 
9162 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9163   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9164   match(Set dst (XorV src1 src2));
9165   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9166   ins_encode %{
9167     int vector_len = 0;
9168     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9169   %}
9170   ins_pipe( pipe_slow );
9171 %}
9172 
9173 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9174   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9175   match(Set dst (XorV src (LoadVector mem)));
9176   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9177   ins_encode %{
9178     int vector_len = 0;
9179     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9180   %}
9181   ins_pipe( pipe_slow );
9182 %}
9183 
9184 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9185   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9186   match(Set dst (XorV src1 src2));
9187   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9188   ins_encode %{
9189     int vector_len = 1;
9190     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9191   %}
9192   ins_pipe( pipe_slow );
9193 %}
9194 
9195 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9196   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9197   match(Set dst (XorV src (LoadVector mem)));
9198   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9199   ins_encode %{
9200     int vector_len = 1;
9201     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9202   %}
9203   ins_pipe( pipe_slow );
9204 %}
9205 
9206 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9207   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9208   match(Set dst (XorV src1 src2));
9209   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9210   ins_encode %{
9211     int vector_len = 2;
9212     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9213   %}
9214   ins_pipe( pipe_slow );
9215 %}
9216 
9217 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9218   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9219   match(Set dst (XorV src (LoadVector mem)));
9220   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9221   ins_encode %{
9222     int vector_len = 2;
9223     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9224   %}
9225   ins_pipe( pipe_slow );
9226 %}
9227 
9228 // --------------------------------- ABS --------------------------------------
9229 // a = |a|
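// Note: pabsb/pabsw/pabsd are SSSE3 instructions, which is what the UseSSE > 2
// predicates check for; the 256-bit vpabs* forms need AVX2, and the packed-long
// evpabsq forms exist only with AVX-512 (hence UseAVX > 2).
//
// Illustrative only (not part of this file): a loop such as
//   for (int i = 0; i < a.length; i++) { b[i] = Math.abs(a[i]); }
// over an int[] is the kind of code SuperWord may reduce to AbsVI nodes that
// these rules then match.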
9230 instruct vabs4B_reg(vecS dst, vecS src) %{
9231   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9232   match(Set dst (AbsVB  src));
9233   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed4B" %}
9234   ins_encode %{
9235     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9236   %}
9237   ins_pipe( pipe_slow );
9238 %}
9239 
9240 instruct vabs8B_reg(vecD dst, vecD src) %{
9241   predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
9242   match(Set dst (AbsVB  src));
9243   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %}
9244   ins_encode %{
9245     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9246   %}
9247   ins_pipe( pipe_slow );
9248 %}
9249 
9250 instruct vabs16B_reg(vecX dst, vecX src) %{
9251   predicate(UseSSE > 2 && n->as_Vector()->length() == 16);
9252   match(Set dst (AbsVB  src));
9253   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %}
9254   ins_encode %{
9255     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9256   %}
9257   ins_pipe( pipe_slow );
9258 %}
9259 
9260 instruct vabs32B_reg(vecY dst, vecY src) %{
9261   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
9262   match(Set dst (AbsVB  src));
9263   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %}
9264   ins_encode %{
9265     int vector_len = 1;
9266     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9267   %}
9268   ins_pipe( pipe_slow );
9269 %}
9270 
9271 instruct vabs64B_reg(vecZ dst, vecZ src) %{
9272   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
9273   match(Set dst (AbsVB  src));
9274   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed64B" %}
9275   ins_encode %{
9276     int vector_len = 2;
9277     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9278   %}
9279   ins_pipe( pipe_slow );
9280 %}
9281 
9282 instruct vabs2S_reg(vecD dst, vecD src) %{
9283   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
9284   match(Set dst (AbsVS  src));
9285   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed2S" %}
9286   ins_encode %{
9287     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9288   %}
9289   ins_pipe( pipe_slow );
9290 %}
9291 
9292 instruct vabs4S_reg(vecD dst, vecD src) %{
9293   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9294   match(Set dst (AbsVS  src));
9295   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %}
9296   ins_encode %{
9297     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9298   %}
9299   ins_pipe( pipe_slow );
9300 %}
9301 
9302 instruct vabs8S_reg(vecX dst, vecX src) %{
9303   predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
9304   match(Set dst (AbsVS  src));
9305   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %}
9306   ins_encode %{
9307     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9308   %}
9309   ins_pipe( pipe_slow );
9310 %}
9311 
9312 instruct vabs16S_reg(vecY dst, vecY src) %{
9313   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9314   match(Set dst (AbsVS  src));
9315   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %}
9316   ins_encode %{
9317     int vector_len = 1;
9318     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9319   %}
9320   ins_pipe( pipe_slow );
9321 %}
9322 
9323 instruct vabs32S_reg(vecZ dst, vecZ src) %{
9324   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
9325   match(Set dst (AbsVS  src));
9326   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed32S" %}
9327   ins_encode %{
9328     int vector_len = 2;
9329     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9330   %}
9331   ins_pipe( pipe_slow );
9332 %}
9333 
9334 instruct vabs2I_reg(vecD dst, vecD src) %{
9335   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
9336   match(Set dst (AbsVI  src));
9337   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %}
9338   ins_encode %{
9339     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
9340   %}
9341   ins_pipe( pipe_slow );
9342 %}
9343 
9344 instruct vabs4I_reg(vecX dst, vecX src) %{
9345   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9346   match(Set dst (AbsVI  src));
9347   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %}
9348   ins_encode %{
9349     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
9350   %}
9351   ins_pipe( pipe_slow );
9352 %}
9353 
9354 instruct vabs8I_reg(vecY dst, vecY src) %{
9355   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9356   match(Set dst (AbsVI src));
9357   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %}
9358   ins_encode %{
9359     int vector_len = 1;
9360     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9361   %}
9362   ins_pipe( pipe_slow );
9363 %}
9364 
9365 instruct vabs16I_reg(vecZ dst, vecZ src) %{
9366   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9367   match(Set dst (AbsVI src));
9368   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed16I" %}
9369   ins_encode %{
9370     int vector_len = 2;
9371     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9372   %}
9373   ins_pipe( pipe_slow );
9374 %}
9375 
9376 instruct vabs2L_reg(vecX dst, vecX src) %{
9377   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
9378   match(Set dst (AbsVL  src));
9379   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %}
9380   ins_encode %{
9381     int vector_len = 0;
9382     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9383   %}
9384   ins_pipe( pipe_slow );
9385 %}
9386 
9387 instruct vabs4L_reg(vecY dst, vecY src) %{
9388   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
9389   match(Set dst (AbsVL  src));
9390   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %}
9391   ins_encode %{
9392     int vector_len = 1;
9393     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9394   %}
9395   ins_pipe( pipe_slow );
9396 %}
9397 
9398 instruct vabs8L_reg(vecZ dst, vecZ src) %{
9399   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9400   match(Set dst (AbsVL  src));
9401   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %}
9402   ins_encode %{
9403     int vector_len = 2;
9404     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9405   %}
9406   ins_pipe( pipe_slow );
9407 %}
9408 
9409 // --------------------------------- ABSNEG --------------------------------------
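// Note: packed FP abs and neg share one rule per width because both are
// bitmask operations (abs clears the sign bits, neg flips them).
// ideal_Opcode() distinguishes AbsV* from NegV* at encode time, and the TEMP
// scratch register is passed to the vabsnegd/vabsnegf macro-assembler helpers
// for addressing the in-memory sign-mask constant.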
9410 
9411 instruct vabsneg2D(vecX dst, vecX src, rRegI scratch) %{
9412   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
9413   match(Set dst (AbsVD  src));
9414   match(Set dst (NegVD  src));
9415   effect(TEMP scratch);
9416   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed2D" %}
9417   ins_encode %{
9418     int opcode = this->as_Mach()->ideal_Opcode();
9419     if ($dst$$XMMRegister != $src$$XMMRegister)
9420       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
9421     __ vabsnegd(opcode, $dst$$XMMRegister, $scratch$$Register);
9422   %}
9423   ins_pipe( pipe_slow );
9424 %}
9425 
9426 instruct vabsneg4D(vecY dst, vecY src, rRegI scratch) %{
9427   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9428   match(Set dst (AbsVD  src));
9429   match(Set dst (NegVD  src));
9430   effect(TEMP scratch);
9431   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed4D" %}
9432   ins_encode %{
9433     int opcode = this->as_Mach()->ideal_Opcode();
9434     int vector_len = 1;
9435     __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9436   %}
9437   ins_pipe( pipe_slow );
9438 %}
9439 
9440 instruct vabsneg8D(vecZ dst, vecZ src, rRegI scratch) %{
9441   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9442   match(Set dst (AbsVD  src));
9443   match(Set dst (NegVD  src));
9444   effect(TEMP scratch);
9445   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed8D" %}
9446   ins_encode %{
9447     int opcode = this->as_Mach()->ideal_Opcode();
9448     int vector_len = 2;
9449     __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9450   %}
9451   ins_pipe( pipe_slow );
9452 %}
9453 
9454 instruct vabsneg2F(vecD dst, vecD src, rRegI scratch) %{
9455   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
9456   match(Set dst (AbsVF  src));
9457   match(Set dst (NegVF  src));
9458   effect(TEMP scratch);
9459   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed2F" %}
9460   ins_encode %{
9461     int opcode = this->as_Mach()->ideal_Opcode();
9462     if ($dst$$XMMRegister != $src$$XMMRegister)
9463       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
9464     __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register);
9465   %}
9466   ins_pipe( pipe_slow );
9467 %}
9468 
9469 instruct vabsneg4F(vecX dst, rRegI scratch) %{
9470   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
9471   match(Set dst (AbsVF  dst));
9472   match(Set dst (NegVF  dst));
9473   effect(TEMP scratch);
9474   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
9475   ins_cost(150);
9476   ins_encode %{
9477     int opcode = this->as_Mach()->ideal_Opcode();
9478     __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register);
9479   %}
9480   ins_pipe( pipe_slow );
9481 %}
9482 
9483 instruct vabsneg8F(vecY dst, vecY src, rRegI scratch) %{
9484   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9485   match(Set dst (AbsVF  src));
9486   match(Set dst (NegVF  src));
9487   effect(TEMP scratch);
9488   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed8F" %}
9489   ins_cost(150);
9490   ins_encode %{
9491     int opcode = this->as_Mach()->ideal_Opcode();
9492     int vector_len = 1;
9493     __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9494   %}
9495   ins_pipe( pipe_slow );
9496 %}
9497 
9498 instruct vabsneg16F(vecZ dst, vecZ src, rRegI scratch) %{
9499   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9500   match(Set dst (AbsVF  src));
9501   match(Set dst (NegVF  src));
9502   effect(TEMP scratch);
9503   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed16F" %}
9504   ins_cost(150);
9505   ins_encode %{
9506     int opcode = this->as_Mach()->ideal_Opcode();
9507     int vector_len = 2;
9508     __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9509   %}
9510   ins_pipe( pipe_slow );
9511 %}
9512 
9513 // --------------------------------- FMA --------------------------------------
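// Note: these rules fuse the multiply and add of FmaVD/FmaVF into a single
// packed FMA instruction via the vfmad/vfmaf macro-assembler helpers; the
// accumulator c is both an input and the result, matching the a * b + c form
// noted on each rule.
//
// Illustrative only (not part of this file): with -XX:+UseFMA, a loop like
//   for (int i = 0; i < a.length; i++) { c[i] = Math.fma(a[i], b[i], c[i]); }
// is the sort of code SuperWord can turn into FmaVF/FmaVD nodes.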
9514 
9515 // a * b + c
9516 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9517   predicate(UseFMA && n->as_Vector()->length() == 2);
9518   match(Set c (FmaVD  c (Binary a b)));
9519   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9520   ins_cost(150);
9521   ins_encode %{
9522     int vector_len = 0;
9523     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9524   %}
9525   ins_pipe( pipe_slow );
9526 %}
9527 
9528 // a * b + c
9529 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9530   predicate(UseFMA && n->as_Vector()->length() == 2);
9531   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9532   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9533   ins_cost(150);
9534   ins_encode %{
9535     int vector_len = 0;
9536     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9537   %}
9538   ins_pipe( pipe_slow );
9539 %}
9540 
9541 
9542 // a * b + c
9543 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9544   predicate(UseFMA && n->as_Vector()->length() == 4);
9545   match(Set c (FmaVD  c (Binary a b)));
9546   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9547   ins_cost(150);
9548   ins_encode %{
9549     int vector_len = 1;
9550     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9551   %}
9552   ins_pipe( pipe_slow );
9553 %}
9554 
9555 // a * b + c
9556 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9557   predicate(UseFMA && n->as_Vector()->length() == 4);
9558   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9559   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9560   ins_cost(150);
9561   ins_encode %{
9562     int vector_len = 1;
9563     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9564   %}
9565   ins_pipe( pipe_slow );
9566 %}
9567 
9568 // a * b + c
9569 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9570   predicate(UseFMA && n->as_Vector()->length() == 8);
9571   match(Set c (FmaVD  c (Binary a b)));
9572   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9573   ins_cost(150);
9574   ins_encode %{
9575     int vector_len = 2;
9576     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9577   %}
9578   ins_pipe( pipe_slow );
9579 %}
9580 
9581 // a * b + c
9582 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9583   predicate(UseFMA && n->as_Vector()->length() == 8);
9584   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9585   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9586   ins_cost(150);
9587   ins_encode %{
9588     int vector_len = 2;
9589     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9590   %}
9591   ins_pipe( pipe_slow );
9592 %}
9593 
9594 // a * b + c
9595 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9596   predicate(UseFMA && n->as_Vector()->length() == 4);
9597   match(Set c (FmaVF  c (Binary a b)));
9598   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9599   ins_cost(150);
9600   ins_encode %{
9601     int vector_len = 0;
9602     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9603   %}
9604   ins_pipe( pipe_slow );
9605 %}
9606 
9607 // a * b + c
9608 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9609   predicate(UseFMA && n->as_Vector()->length() == 4);
9610   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9611   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9612   ins_cost(150);
9613   ins_encode %{
9614     int vector_len = 0;
9615     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 // a * b + c
9621 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9622   predicate(UseFMA && n->as_Vector()->length() == 8);
9623   match(Set c (FmaVF  c (Binary a b)));
9624   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9625   ins_cost(150);
9626   ins_encode %{
9627     int vector_len = 1;
9628     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9629   %}
9630   ins_pipe( pipe_slow );
9631 %}
9632 
9633 // a * b + c
9634 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9635   predicate(UseFMA && n->as_Vector()->length() == 8);
9636   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9637   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9638   ins_cost(150);
9639   ins_encode %{
9640     int vector_len = 1;
9641     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9642   %}
9643   ins_pipe( pipe_slow );
9644 %}
9645 
9646 // a * b + c
9647 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9648   predicate(UseFMA && n->as_Vector()->length() == 16);
9649   match(Set c (FmaVF  c (Binary a b)));
9650   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9651   ins_cost(150);
9652   ins_encode %{
9653     int vector_len = 2;
9654     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9655   %}
9656   ins_pipe( pipe_slow );
9657 %}
9658 
9659 // a * b + c
9660 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9661   predicate(UseFMA && n->as_Vector()->length() == 16);
9662   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9663   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9664   ins_cost(150);
9665   ins_encode %{
9666     int vector_len = 2;
9667     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9668   %}
9669   ins_pipe( pipe_slow );
9670 %}
9671 
9672 // --------------------------------- Vector Multiply Add --------------------------------------
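// Note: MulAddVS2VI multiplies pairs of adjacent 16-bit elements and adds each
// pair into a 32-bit lane, which maps directly onto pmaddwd/vpmaddwd.
//
// Illustrative only (not part of this file): the scalar idiom behind this node
// is a short dot product that sums adjacent products, e.g.
//   for (int i = 0; i < n; i += 2) {
//     sum += s1[i] * s2[i] + s1[i + 1] * s2[i + 1];
//   }
// which C2 can recognize and then vectorize into MulAddVS2VI.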
9673 
9674 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9675   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9676   match(Set dst (MulAddVS2VI dst src1));
9677   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9678   ins_encode %{
9679     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9680   %}
9681   ins_pipe( pipe_slow );
9682 %}
9683 
9684 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9685   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9686   match(Set dst (MulAddVS2VI src1 src2));
9687   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9688   ins_encode %{
9689     int vector_len = 0;
9690     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9691   %}
9692   ins_pipe( pipe_slow );
9693 %}
9694 
9695 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
9696   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
9697   match(Set dst (MulAddVS2VI dst src1));
9698   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
9699   ins_encode %{
9700     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9701   %}
9702   ins_pipe( pipe_slow );
9703 %}
9704 
9705 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9706   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9707   match(Set dst (MulAddVS2VI src1 src2));
9708   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
9709   ins_encode %{
9710     int vector_len = 0;
9711     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9712   %}
9713   ins_pipe( pipe_slow );
9714 %}
9715 
9716 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9717   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9718   match(Set dst (MulAddVS2VI src1 src2));
9719   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
9720   ins_encode %{
9721     int vector_len = 1;
9722     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9723   %}
9724   ins_pipe( pipe_slow );
9725 %}
9726 
9727 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9728   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9729   match(Set dst (MulAddVS2VI src1 src2));
9730   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
9731   ins_encode %{
9732     int vector_len = 2;
9733     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9734   %}
9735   ins_pipe( pipe_slow );
9736 %}
9737 
9738 // --------------------------------- Vector Multiply Add Add ----------------------------------
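// Note: these rules extend the multiply-add above by folding the following
// vector add into a single AVX-512 VNNI dot-product-accumulate (evpdpwssd),
// as expressed by the (AddVI (MulAddVS2VI src1 src2) dst) match.  The low
// ins_cost(10) is intended to make the matcher prefer the fused form over the
// separate vpmaddwd + vpaddd sequence when VNNI is available.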
9739 
9740 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9741   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
9742   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9743   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
9744   ins_encode %{
9745     int vector_len = 0;
9746     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9747   %}
9748   ins_pipe( pipe_slow );
9749   ins_cost(10);
9750 %}
9751 
9752 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9753   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
9754   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9755   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
9756   ins_encode %{
9757     int vector_len = 0;
9758     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9759   %}
9760   ins_pipe( pipe_slow );
9761   ins_cost(10);
9762 %}
9763 
9764 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9765   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
9766   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9767   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
9768   ins_encode %{
9769     int vector_len = 1;
9770     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9771   %}
9772   ins_pipe( pipe_slow );
9773   ins_cost(10);
9774 %}
9775 
9776 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9777   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
9778   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9779   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
9780   ins_encode %{
9781     int vector_len = 2;
9782     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9783   %}
9784   ins_pipe( pipe_slow );
9785   ins_cost(10);
9786 %}
9787 
9788 // --------------------------------- PopCount --------------------------------------
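// Note: PopCountVI requires the AVX512_VPOPCNTDQ extension (vpopcntd), checked
// via VM_Version::supports_vpopcntdq(), in addition to
// -XX:+UsePopCountInstruction.
//
// Illustrative only (not part of this file): a loop such as
//   for (int i = 0; i < a.length; i++) { b[i] = Integer.bitCount(a[i]); }
// is the kind of code SuperWord can vectorize into PopCountVI on such CPUs.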
9789 
9790 instruct vpopcount2I(vecD dst, vecD src) %{
9791   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
9792   match(Set dst (PopCountVI src));
9793   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
9794   ins_encode %{
9795     int vector_len = 0;
9796     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9797   %}
9798   ins_pipe( pipe_slow );
9799 %}
9800 
9801 instruct vpopcount4I(vecX dst, vecX src) %{
9802   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
9803   match(Set dst (PopCountVI src));
9804   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
9805   ins_encode %{
9806     int vector_len = 0;
9807     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9808   %}
9809   ins_pipe( pipe_slow );
9810 %}
9811 
9812 instruct vpopcount8I(vecY dst, vecY src) %{
9813   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
9814   match(Set dst (PopCountVI src));
9815   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
9816   ins_encode %{
9817     int vector_len = 1;
9818     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9819   %}
9820   ins_pipe( pipe_slow );
9821 %}
9822 
9823 instruct vpopcount16I(vecZ dst, vecZ src) %{
9824   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
9825   match(Set dst (PopCountVI src));
9826   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
9827   ins_encode %{
9828     int vector_len = 2;
9829     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9830   %}
9831   ins_pipe( pipe_slow );
9832 %}