1 //
   2 // Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // archtecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  256-bit registers or 8 words each, labeled (a)-h.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
  68 // Linux ABI:   No register preserved across function calls
  69 //              XMM0-XMM7 might hold parameters
  70 // Windows ABI: XMM6-XMM15 preserved across function calls
  71 //              XMM0-XMM3 might hold parameters
  72 
  73 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  74 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  75 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  76 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  77 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  78 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  79 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  80 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  81 
  82 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  83 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  84 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  85 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  86 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  87 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  88 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  89 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  90 
  91 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  92 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  93 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  94 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  95 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  96 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  97 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  98 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  99 
 100 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 101 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 102 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 103 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 104 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 105 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 106 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 107 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 108 
 109 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 110 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 111 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 112 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 113 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 114 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 115 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 116 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 117 
 118 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 119 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 120 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 121 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 122 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 123 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 124 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 125 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 126 
 127 #ifdef _WIN64
 128 
 129 reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
 130 reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 131 reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 132 reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 133 reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 134 reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 135 reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 136 reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 137 
 138 reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
 139 reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 140 reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 141 reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 142 reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 143 reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 144 reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 145 reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 146 
 147 reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
 148 reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 149 reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 150 reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 151 reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 152 reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 153 reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 154 reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 155 
 156 reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
 157 reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 158 reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 159 reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 160 reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 161 reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 162 reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 163 reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 164 
 165 reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
 166 reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 167 reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 168 reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 169 reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 170 reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 171 reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 172 reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 173 
 174 reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
 175 reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 176 reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 177 reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 178 reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 179 reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 180 reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 181 reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 182 
 183 reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
 184 reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 185 reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 186 reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 187 reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 188 reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 189 reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 190 reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 191 
 192 reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
 193 reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 194 reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 195 reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 196 reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 197 reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 198 reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 199 reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 200 
 201 reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
 202 reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 203 reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 204 reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 205 reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 206 reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 207 reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 208 reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 209 
 210 reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
 211 reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 212 reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 213 reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 214 reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 215 reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 216 reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 217 reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 218 
 219 #else // _WIN64
 220 
 221 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 222 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 223 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 224 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 225 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 226 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 227 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 228 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 229 
 230 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 231 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 232 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 233 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 234 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 235 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 236 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 237 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 238 
 239 #ifdef _LP64
 240 
 241 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 242 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 243 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 244 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 245 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 246 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 247 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 248 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 249 
 250 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 251 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 252 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 253 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 254 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 255 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 256 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 257 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 258 
 259 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 260 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 261 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 262 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 263 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 264 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 265 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 266 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 267 
 268 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 269 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 270 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 271 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 272 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 273 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 274 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 275 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 276 
 277 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 278 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 279 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 280 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 281 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 282 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 283 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 284 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 285 
 286 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 287 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 288 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 289 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 290 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 291 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 292 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 293 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 294 
 295 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 296 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 297 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 298 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 299 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 300 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 301 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 302 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 303 
 304 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 305 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 306 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 307 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 308 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 309 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 310 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 311 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 312 
 313 #endif // _LP64
 314 
 315 #endif // _WIN64
 316 
 317 #ifdef _LP64
 318 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 319 #else
 320 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 321 #endif // _LP64
 322 
 323 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 324                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 325                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 326                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 327                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 328                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 329                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 330                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 331 #ifdef _LP64
 332                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 333                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 334                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 335                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 336                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 337                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 338                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 339                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 340 #endif
 341                    );
 342 
 343 // flags allocation class should be last.
 344 alloc_class chunk2(RFLAGS);
 345 
 346 // Singleton class for condition codes
 347 reg_class int_flags(RFLAGS);
 348 
 349 // Class for all float registers
 350 reg_class float_reg(XMM0,
 351                     XMM1,
 352                     XMM2,
 353                     XMM3,
 354                     XMM4,
 355                     XMM5,
 356                     XMM6,
 357                     XMM7
 358 #ifdef _LP64
 359                    ,XMM8,
 360                     XMM9,
 361                     XMM10,
 362                     XMM11,
 363                     XMM12,
 364                     XMM13,
 365                     XMM14,
 366                     XMM15
 367 #endif
 368                     );
 369 
 370 // Class for all double registers
 371 reg_class double_reg(XMM0,  XMM0b,
 372                      XMM1,  XMM1b,
 373                      XMM2,  XMM2b,
 374                      XMM3,  XMM3b,
 375                      XMM4,  XMM4b,
 376                      XMM5,  XMM5b,
 377                      XMM6,  XMM6b,
 378                      XMM7,  XMM7b
 379 #ifdef _LP64
 380                     ,XMM8,  XMM8b,
 381                      XMM9,  XMM9b,
 382                      XMM10, XMM10b,
 383                      XMM11, XMM11b,
 384                      XMM12, XMM12b,
 385                      XMM13, XMM13b,
 386                      XMM14, XMM14b,
 387                      XMM15, XMM15b
 388 #endif
 389                      );
 390 
 391 // Class for all 32bit vector registers
 392 reg_class vectors_reg(XMM0,
 393                       XMM1,
 394                       XMM2,
 395                       XMM3,
 396                       XMM4,
 397                       XMM5,
 398                       XMM6,
 399                       XMM7
 400 #ifdef _LP64
 401                      ,XMM8,
 402                       XMM9,
 403                       XMM10,
 404                       XMM11,
 405                       XMM12,
 406                       XMM13,
 407                       XMM14,
 408                       XMM15
 409 #endif
 410                       );
 411 
 412 // Class for all 64bit vector registers
 413 reg_class vectord_reg(XMM0,  XMM0b,
 414                       XMM1,  XMM1b,
 415                       XMM2,  XMM2b,
 416                       XMM3,  XMM3b,
 417                       XMM4,  XMM4b,
 418                       XMM5,  XMM5b,
 419                       XMM6,  XMM6b,
 420                       XMM7,  XMM7b
 421 #ifdef _LP64
 422                      ,XMM8,  XMM8b,
 423                       XMM9,  XMM9b,
 424                       XMM10, XMM10b,
 425                       XMM11, XMM11b,
 426                       XMM12, XMM12b,
 427                       XMM13, XMM13b,
 428                       XMM14, XMM14b,
 429                       XMM15, XMM15b
 430 #endif
 431                       );
 432 
 433 // Class for all 128bit vector registers
 434 reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
 435                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 436                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 437                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 438                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 439                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 440                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 441                       XMM7,  XMM7b,  XMM7c,  XMM7d
 442 #ifdef _LP64
 443                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 444                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 445                       XMM10, XMM10b, XMM10c, XMM10d,
 446                       XMM11, XMM11b, XMM11c, XMM11d,
 447                       XMM12, XMM12b, XMM12c, XMM12d,
 448                       XMM13, XMM13b, XMM13c, XMM13d,
 449                       XMM14, XMM14b, XMM14c, XMM14d,
 450                       XMM15, XMM15b, XMM15c, XMM15d
 451 #endif
 452                       );
 453 
 454 // Class for all 256bit vector registers
 455 reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 456                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 457                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 458                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 459                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 460                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 461                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 462                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 463 #ifdef _LP64
 464                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 465                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 466                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 467                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 468                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 469                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 470                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 471                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 472 #endif
 473                       );
 474 
 475 %}
 476 
 477 
 478 //----------SOURCE BLOCK-------------------------------------------------------
 479 // This is a block of C++ code which provides values, functions, and
 480 // definitions necessary in the rest of the architecture description
 481 
 482 source_hpp %{
 483 // Header information of the source block.
 484 // Method declarations/definitions which are used outside
 485 // the ad-scope can conveniently be defined here.
 486 //
 487 // To keep related declarations/definitions/uses close together,
 488 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 489 
 490 class NativeJump;
 491 
 492 class CallStubImpl {
 493 
 494   //--------------------------------------------------------------
 495   //---<  Used for optimization in Compile::shorten_branches  >---
 496   //--------------------------------------------------------------
 497 
 498  public:
 499   // Size of call trampoline stub.
 500   static uint size_call_trampoline() {
 501     return 0; // no call trampolines on this platform
 502   }
 503 
 504   // number of relocations needed by a call trampoline stub
 505   static uint reloc_call_trampoline() {
 506     return 0; // no call trampolines on this platform
 507   }
 508 };
 509 
 510 class HandlerImpl {
 511 
 512  public:
 513 
 514   static int emit_exception_handler(CodeBuffer &cbuf);
 515   static int emit_deopt_handler(CodeBuffer& cbuf);
 516 
 517   static uint size_exception_handler() {
 518     // NativeCall instruction size is the same as NativeJump.
 519     // exception handler starts out as jump and can be patched to
 520     // a call be deoptimization.  (4932387)
 521     // Note that this value is also credited (in output.cpp) to
 522     // the size of the code section.
 523     return NativeJump::instruction_size;
 524   }
 525 
 526 #ifdef _LP64
 527   static uint size_deopt_handler() {
 528     // three 5 byte instructions
 529     return 15;
 530   }
 531 #else
 532   static uint size_deopt_handler() {
 533     // NativeCall instruction size is the same as NativeJump.
 534     // exception handler starts out as jump and can be patched to
 535     // a call be deoptimization.  (4932387)
 536     // Note that this value is also credited (in output.cpp) to
 537     // the size of the code section.
 538     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 539   }
 540 #endif
 541 };
 542 
 543 %} // end source_hpp
 544 
 545 source %{
 546 
 547 // Emit exception handler code.
 548 // Stuff framesize into a register and call a VM stub routine.
 549 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 550 
 551   // Note that the code buffer's insts_mark is always relative to insts.
 552   // That's why we must use the macroassembler to generate a handler.
 553   MacroAssembler _masm(&cbuf);
 554   address base = __ start_a_stub(size_exception_handler());
 555   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 556   int offset = __ offset();
 557   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 558   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 559   __ end_a_stub();
 560   return offset;
 561 }
 562 
 563 // Emit deopt handler code.
 564 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 565 
 566   // Note that the code buffer's insts_mark is always relative to insts.
 567   // That's why we must use the macroassembler to generate a handler.
 568   MacroAssembler _masm(&cbuf);
 569   address base = __ start_a_stub(size_deopt_handler());
 570   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 571   int offset = __ offset();
 572 
 573 #ifdef _LP64
 574   address the_pc = (address) __ pc();
 575   Label next;
 576   // push a "the_pc" on the stack without destroying any registers
 577   // as they all may be live.
 578 
 579   // push address of "next"
 580   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 581   __ bind(next);
 582   // adjust it so it matches "the_pc"
 583   __ subptr(Address(rsp, 0), __ offset() - offset);
 584 #else
 585   InternalAddress here(__ pc());
 586   __ pushptr(here.addr());
 587 #endif
 588 
 589   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 590   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
 591   __ end_a_stub();
 592   return offset;
 593 }
 594 
 595 
 596 //=============================================================================
 597 
 598   // Float masks come from different places depending on platform.
 599 #ifdef _LP64
 600   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 601   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 602   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 603   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 604 #else
 605   static address float_signmask()  { return (address)float_signmask_pool; }
 606   static address float_signflip()  { return (address)float_signflip_pool; }
 607   static address double_signmask() { return (address)double_signmask_pool; }
 608   static address double_signflip() { return (address)double_signflip_pool; }
 609 #endif
 610 
 611 
 612 const bool Matcher::match_rule_supported(int opcode) {
 613   if (!has_match_rule(opcode))
 614     return false;
 615 
 616   switch (opcode) {
 617     case Op_PopCountI:
 618     case Op_PopCountL:
 619       if (!UsePopCountInstruction)
 620         return false;
 621     break;
 622     case Op_MulVI:
 623       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
 624         return false;
 625     break;
 626     case Op_AddReductionVL:
 627       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
 628         return false;
 629     case Op_AddReductionVI:
 630       if (UseSSE < 3) // requires at least SSE3
 631         return false;
 632     case Op_MulReductionVI:
 633       if (UseSSE < 4) // requires at least SSE4
 634         return false;
 635     case Op_AddReductionVF:
 636     case Op_AddReductionVD:
 637     case Op_MulReductionVF:
 638     case Op_MulReductionVD:
 639       if (UseSSE < 1) // requires at least SSE
 640         return false;
 641     break;
 642     case Op_CompareAndSwapL:
 643 #ifdef _LP64
 644     case Op_CompareAndSwapP:
 645 #endif
 646       if (!VM_Version::supports_cx8())
 647         return false;
 648     break;
 649   }
 650 
 651   return true;  // Per default match rules are supported.
 652 }
 653 
 654 // Max vector size in bytes. 0 if not supported.
 655 const int Matcher::vector_width_in_bytes(BasicType bt) {
 656   assert(is_java_primitive(bt), "only primitive type vectors");
 657   if (UseSSE < 2) return 0;
 658   // SSE2 supports 128bit vectors for all types.
 659   // AVX2 supports 256bit vectors for all types.
 660   int size = (UseAVX > 1) ? 32 : 16;
 661   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 662   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 663     size = 32;
 664   // Use flag to limit vector size.
 665   size = MIN2(size,(int)MaxVectorSize);
 666   // Minimum 2 values in vector (or 4 for bytes).
 667   switch (bt) {
 668   case T_DOUBLE:
 669   case T_LONG:
 670     if (size < 16) return 0;
 671   case T_FLOAT:
 672   case T_INT:
 673     if (size < 8) return 0;
 674   case T_BOOLEAN:
 675   case T_BYTE:
 676   case T_CHAR:
 677   case T_SHORT:
 678     if (size < 4) return 0;
 679     break;
 680   default:
 681     ShouldNotReachHere();
 682   }
 683   return size;
 684 }
 685 
 686 // Limits on vector size (number of elements) loaded into vector.
 687 const int Matcher::max_vector_size(const BasicType bt) {
 688   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 689 }
 690 const int Matcher::min_vector_size(const BasicType bt) {
 691   int max_size = max_vector_size(bt);
 692   // Min size which can be loaded into vector is 4 bytes.
 693   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 694   return MIN2(size,max_size);
 695 }
 696 
 697 // Vector ideal reg corresponding to specidied size in bytes
 698 const int Matcher::vector_ideal_reg(int size) {
 699   assert(MaxVectorSize >= size, "");
 700   switch(size) {
 701     case  4: return Op_VecS;
 702     case  8: return Op_VecD;
 703     case 16: return Op_VecX;
 704     case 32: return Op_VecY;
 705   }
 706   ShouldNotReachHere();
 707   return 0;
 708 }
 709 
 710 // Only lowest bits of xmm reg are used for vector shift count.
 711 const int Matcher::vector_shift_count_ideal_reg(int size) {
 712   return Op_VecS;
 713 }
 714 
 715 // x86 supports misaligned vectors store/load.
 716 const bool Matcher::misaligned_vectors_ok() {
 717   return !AlignVector; // can be changed by flag
 718 }
 719 
 720 // x86 AES instructions are compatible with SunJCE expanded
 721 // keys, hence we do not need to pass the original key to stubs
 722 const bool Matcher::pass_original_key_for_aes() {
 723   return false;
 724 }
 725 
 726 // Helper methods for MachSpillCopyNode::implementation().
 727 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
 728                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 729   // In 64-bit VM size calculation is very complex. Emitting instructions
 730   // into scratch buffer is used to get size in 64-bit VM.
 731   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 732   assert(ireg == Op_VecS || // 32bit vector
 733          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 734          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 735          "no non-adjacent vector moves" );
 736   if (cbuf) {
 737     MacroAssembler _masm(cbuf);
 738     int offset = __ offset();
 739     switch (ireg) {
 740     case Op_VecS: // copy whole register
 741     case Op_VecD:
 742     case Op_VecX:
 743       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 744       break;
 745     case Op_VecY:
 746       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 747       break;
 748     default:
 749       ShouldNotReachHere();
 750     }
 751     int size = __ offset() - offset;
 752 #ifdef ASSERT
 753     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 754     assert(!do_size || size == 4, "incorrect size calculattion");
 755 #endif
 756     return size;
 757 #ifndef PRODUCT
 758   } else if (!do_size) {
 759     switch (ireg) {
 760     case Op_VecS:
 761     case Op_VecD:
 762     case Op_VecX:
 763       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 764       break;
 765     case Op_VecY:
 766       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 767       break;
 768     default:
 769       ShouldNotReachHere();
 770     }
 771 #endif
 772   }
 773   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
 774   return 4;
 775 }
 776 
 777 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
 778                             int stack_offset, int reg, uint ireg, outputStream* st) {
 779   // In 64-bit VM size calculation is very complex. Emitting instructions
 780   // into scratch buffer is used to get size in 64-bit VM.
 781   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 782   if (cbuf) {
 783     MacroAssembler _masm(cbuf);
 784     int offset = __ offset();
 785     if (is_load) {
 786       switch (ireg) {
 787       case Op_VecS:
 788         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 789         break;
 790       case Op_VecD:
 791         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 792         break;
 793       case Op_VecX:
 794         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 795         break;
 796       case Op_VecY:
 797         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 798         break;
 799       default:
 800         ShouldNotReachHere();
 801       }
 802     } else { // store
 803       switch (ireg) {
 804       case Op_VecS:
 805         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 806         break;
 807       case Op_VecD:
 808         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 809         break;
 810       case Op_VecX:
 811         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 812         break;
 813       case Op_VecY:
 814         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 815         break;
 816       default:
 817         ShouldNotReachHere();
 818       }
 819     }
 820     int size = __ offset() - offset;
 821 #ifdef ASSERT
 822     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 823     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 824     assert(!do_size || size == (5+offset_size), "incorrect size calculattion");
 825 #endif
 826     return size;
 827 #ifndef PRODUCT
 828   } else if (!do_size) {
 829     if (is_load) {
 830       switch (ireg) {
 831       case Op_VecS:
 832         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 833         break;
 834       case Op_VecD:
 835         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 836         break;
 837        case Op_VecX:
 838         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 839         break;
 840       case Op_VecY:
 841         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 842         break;
 843       default:
 844         ShouldNotReachHere();
 845       }
 846     } else { // store
 847       switch (ireg) {
 848       case Op_VecS:
 849         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 850         break;
 851       case Op_VecD:
 852         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 853         break;
 854        case Op_VecX:
 855         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 856         break;
 857       case Op_VecY:
 858         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 859         break;
 860       default:
 861         ShouldNotReachHere();
 862       }
 863     }
 864 #endif
 865   }
 866   int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 867   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 868   return 5+offset_size;
 869 }
 870 
 871 static inline jfloat replicate4_imm(int con, int width) {
 872   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
 873   assert(width == 1 || width == 2, "only byte or short types here");
 874   int bit_width = width * 8;
 875   jint val = con;
 876   val &= (1 << bit_width) - 1;  // mask off sign bits
 877   while(bit_width < 32) {
 878     val |= (val << bit_width);
 879     bit_width <<= 1;
 880   }
 881   jfloat fval = *((jfloat*) &val);  // coerce to float type
 882   return fval;
 883 }
 884 
 885 static inline jdouble replicate8_imm(int con, int width) {
 886   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
 887   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
 888   int bit_width = width * 8;
 889   jlong val = con;
 890   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
 891   while(bit_width < 64) {
 892     val |= (val << bit_width);
 893     bit_width <<= 1;
 894   }
 895   jdouble dval = *((jdouble*) &val);  // coerce to double type
 896   return dval;
 897 }
 898 
 899 #ifndef PRODUCT
 900   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 901     st->print("nop \t# %d bytes pad for loops and calls", _count);
 902   }
 903 #endif
 904 
 905   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 906     MacroAssembler _masm(&cbuf);
 907     __ nop(_count);
 908   }
 909 
 910   uint MachNopNode::size(PhaseRegAlloc*) const {
 911     return _count;
 912   }
 913 
 914 #ifndef PRODUCT
 915   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 916     st->print("# breakpoint");
 917   }
 918 #endif
 919 
 920   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 921     MacroAssembler _masm(&cbuf);
 922     __ int3();
 923   }
 924 
 925   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 926     return MachNode::size(ra_);
 927   }
 928 
 929 %}
 930 
 931 encode %{
 932 
 933   enc_class call_epilog %{
 934     if (VerifyStackAtCalls) {
 935       // Check that stack depth is unchanged: find majik cookie on stack
 936       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 937       MacroAssembler _masm(&cbuf);
 938       Label L;
 939       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 940       __ jccb(Assembler::equal, L);
 941       // Die if stack mismatch
 942       __ int3();
 943       __ bind(L);
 944     }
 945   %}
 946 
 947 %}
 948 
 949 
 950 //----------OPERANDS-----------------------------------------------------------
 951 // Operand definitions must precede instruction definitions for correct parsing
 952 // in the ADLC because operands constitute user defined types which are used in
 953 // instruction definitions.
 954 
 955 // Vectors
 956 operand vecS() %{
 957   constraint(ALLOC_IN_RC(vectors_reg));
 958   match(VecS);
 959 
 960   format %{ %}
 961   interface(REG_INTER);
 962 %}
 963 
 964 operand vecD() %{
 965   constraint(ALLOC_IN_RC(vectord_reg));
 966   match(VecD);
 967 
 968   format %{ %}
 969   interface(REG_INTER);
 970 %}
 971 
 972 operand vecX() %{
 973   constraint(ALLOC_IN_RC(vectorx_reg));
 974   match(VecX);
 975 
 976   format %{ %}
 977   interface(REG_INTER);
 978 %}
 979 
 980 operand vecY() %{
 981   constraint(ALLOC_IN_RC(vectory_reg));
 982   match(VecY);
 983 
 984   format %{ %}
 985   interface(REG_INTER);
 986 %}
 987 
 988 
 989 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 990 
 991 // ============================================================================
 992 
 993 instruct ShouldNotReachHere() %{
 994   match(Halt);
 995   format %{ "int3\t# ShouldNotReachHere" %}
 996   ins_encode %{
 997     __ int3();
 998   %}
 999   ins_pipe(pipe_slow);
1000 %}
1001 
1002 // ============================================================================
1003 
1004 instruct addF_reg(regF dst, regF src) %{
1005   predicate((UseSSE>=1) && (UseAVX == 0));
1006   match(Set dst (AddF dst src));
1007 
1008   format %{ "addss   $dst, $src" %}
1009   ins_cost(150);
1010   ins_encode %{
1011     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1012   %}
1013   ins_pipe(pipe_slow);
1014 %}
1015 
1016 instruct addF_mem(regF dst, memory src) %{
1017   predicate((UseSSE>=1) && (UseAVX == 0));
1018   match(Set dst (AddF dst (LoadF src)));
1019 
1020   format %{ "addss   $dst, $src" %}
1021   ins_cost(150);
1022   ins_encode %{
1023     __ addss($dst$$XMMRegister, $src$$Address);
1024   %}
1025   ins_pipe(pipe_slow);
1026 %}
1027 
1028 instruct addF_imm(regF dst, immF con) %{
1029   predicate((UseSSE>=1) && (UseAVX == 0));
1030   match(Set dst (AddF dst con));
1031   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1032   ins_cost(150);
1033   ins_encode %{
1034     __ addss($dst$$XMMRegister, $constantaddress($con));
1035   %}
1036   ins_pipe(pipe_slow);
1037 %}
1038 
1039 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1040   predicate(UseAVX > 0);
1041   match(Set dst (AddF src1 src2));
1042 
1043   format %{ "vaddss  $dst, $src1, $src2" %}
1044   ins_cost(150);
1045   ins_encode %{
1046     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1047   %}
1048   ins_pipe(pipe_slow);
1049 %}
1050 
1051 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1052   predicate(UseAVX > 0);
1053   match(Set dst (AddF src1 (LoadF src2)));
1054 
1055   format %{ "vaddss  $dst, $src1, $src2" %}
1056   ins_cost(150);
1057   ins_encode %{
1058     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1059   %}
1060   ins_pipe(pipe_slow);
1061 %}
1062 
1063 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1064   predicate(UseAVX > 0);
1065   match(Set dst (AddF src con));
1066 
1067   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1068   ins_cost(150);
1069   ins_encode %{
1070     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1071   %}
1072   ins_pipe(pipe_slow);
1073 %}
1074 
1075 instruct addD_reg(regD dst, regD src) %{
1076   predicate((UseSSE>=2) && (UseAVX == 0));
1077   match(Set dst (AddD dst src));
1078 
1079   format %{ "addsd   $dst, $src" %}
1080   ins_cost(150);
1081   ins_encode %{
1082     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1083   %}
1084   ins_pipe(pipe_slow);
1085 %}
1086 
1087 instruct addD_mem(regD dst, memory src) %{
1088   predicate((UseSSE>=2) && (UseAVX == 0));
1089   match(Set dst (AddD dst (LoadD src)));
1090 
1091   format %{ "addsd   $dst, $src" %}
1092   ins_cost(150);
1093   ins_encode %{
1094     __ addsd($dst$$XMMRegister, $src$$Address);
1095   %}
1096   ins_pipe(pipe_slow);
1097 %}
1098 
1099 instruct addD_imm(regD dst, immD con) %{
1100   predicate((UseSSE>=2) && (UseAVX == 0));
1101   match(Set dst (AddD dst con));
1102   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1103   ins_cost(150);
1104   ins_encode %{
1105     __ addsd($dst$$XMMRegister, $constantaddress($con));
1106   %}
1107   ins_pipe(pipe_slow);
1108 %}
1109 
1110 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1111   predicate(UseAVX > 0);
1112   match(Set dst (AddD src1 src2));
1113 
1114   format %{ "vaddsd  $dst, $src1, $src2" %}
1115   ins_cost(150);
1116   ins_encode %{
1117     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1118   %}
1119   ins_pipe(pipe_slow);
1120 %}
1121 
1122 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1123   predicate(UseAVX > 0);
1124   match(Set dst (AddD src1 (LoadD src2)));
1125 
1126   format %{ "vaddsd  $dst, $src1, $src2" %}
1127   ins_cost(150);
1128   ins_encode %{
1129     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1130   %}
1131   ins_pipe(pipe_slow);
1132 %}
1133 
1134 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1135   predicate(UseAVX > 0);
1136   match(Set dst (AddD src con));
1137 
1138   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1139   ins_cost(150);
1140   ins_encode %{
1141     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1142   %}
1143   ins_pipe(pipe_slow);
1144 %}
1145 
1146 instruct subF_reg(regF dst, regF src) %{
1147   predicate((UseSSE>=1) && (UseAVX == 0));
1148   match(Set dst (SubF dst src));
1149 
1150   format %{ "subss   $dst, $src" %}
1151   ins_cost(150);
1152   ins_encode %{
1153     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1154   %}
1155   ins_pipe(pipe_slow);
1156 %}
1157 
1158 instruct subF_mem(regF dst, memory src) %{
1159   predicate((UseSSE>=1) && (UseAVX == 0));
1160   match(Set dst (SubF dst (LoadF src)));
1161 
1162   format %{ "subss   $dst, $src" %}
1163   ins_cost(150);
1164   ins_encode %{
1165     __ subss($dst$$XMMRegister, $src$$Address);
1166   %}
1167   ins_pipe(pipe_slow);
1168 %}
1169 
1170 instruct subF_imm(regF dst, immF con) %{
1171   predicate((UseSSE>=1) && (UseAVX == 0));
1172   match(Set dst (SubF dst con));
1173   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1174   ins_cost(150);
1175   ins_encode %{
1176     __ subss($dst$$XMMRegister, $constantaddress($con));
1177   %}
1178   ins_pipe(pipe_slow);
1179 %}
1180 
1181 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
1182   predicate(UseAVX > 0);
1183   match(Set dst (SubF src1 src2));
1184 
1185   format %{ "vsubss  $dst, $src1, $src2" %}
1186   ins_cost(150);
1187   ins_encode %{
1188     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1189   %}
1190   ins_pipe(pipe_slow);
1191 %}
1192 
1193 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
1194   predicate(UseAVX > 0);
1195   match(Set dst (SubF src1 (LoadF src2)));
1196 
1197   format %{ "vsubss  $dst, $src1, $src2" %}
1198   ins_cost(150);
1199   ins_encode %{
1200     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1201   %}
1202   ins_pipe(pipe_slow);
1203 %}
1204 
1205 instruct subF_reg_imm(regF dst, regF src, immF con) %{
1206   predicate(UseAVX > 0);
1207   match(Set dst (SubF src con));
1208 
1209   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1210   ins_cost(150);
1211   ins_encode %{
1212     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1213   %}
1214   ins_pipe(pipe_slow);
1215 %}
1216 
1217 instruct subD_reg(regD dst, regD src) %{
1218   predicate((UseSSE>=2) && (UseAVX == 0));
1219   match(Set dst (SubD dst src));
1220 
1221   format %{ "subsd   $dst, $src" %}
1222   ins_cost(150);
1223   ins_encode %{
1224     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
1225   %}
1226   ins_pipe(pipe_slow);
1227 %}
1228 
1229 instruct subD_mem(regD dst, memory src) %{
1230   predicate((UseSSE>=2) && (UseAVX == 0));
1231   match(Set dst (SubD dst (LoadD src)));
1232 
1233   format %{ "subsd   $dst, $src" %}
1234   ins_cost(150);
1235   ins_encode %{
1236     __ subsd($dst$$XMMRegister, $src$$Address);
1237   %}
1238   ins_pipe(pipe_slow);
1239 %}
1240 
1241 instruct subD_imm(regD dst, immD con) %{
1242   predicate((UseSSE>=2) && (UseAVX == 0));
1243   match(Set dst (SubD dst con));
1244   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1245   ins_cost(150);
1246   ins_encode %{
1247     __ subsd($dst$$XMMRegister, $constantaddress($con));
1248   %}
1249   ins_pipe(pipe_slow);
1250 %}
1251 
1252 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
1253   predicate(UseAVX > 0);
1254   match(Set dst (SubD src1 src2));
1255 
1256   format %{ "vsubsd  $dst, $src1, $src2" %}
1257   ins_cost(150);
1258   ins_encode %{
1259     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1260   %}
1261   ins_pipe(pipe_slow);
1262 %}
1263 
1264 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
1265   predicate(UseAVX > 0);
1266   match(Set dst (SubD src1 (LoadD src2)));
1267 
1268   format %{ "vsubsd  $dst, $src1, $src2" %}
1269   ins_cost(150);
1270   ins_encode %{
1271     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1272   %}
1273   ins_pipe(pipe_slow);
1274 %}
1275 
1276 instruct subD_reg_imm(regD dst, regD src, immD con) %{
1277   predicate(UseAVX > 0);
1278   match(Set dst (SubD src con));
1279 
1280   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1281   ins_cost(150);
1282   ins_encode %{
1283     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1284   %}
1285   ins_pipe(pipe_slow);
1286 %}
1287 
1288 instruct mulF_reg(regF dst, regF src) %{
1289   predicate((UseSSE>=1) && (UseAVX == 0));
1290   match(Set dst (MulF dst src));
1291 
1292   format %{ "mulss   $dst, $src" %}
1293   ins_cost(150);
1294   ins_encode %{
1295     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
1296   %}
1297   ins_pipe(pipe_slow);
1298 %}
1299 
1300 instruct mulF_mem(regF dst, memory src) %{
1301   predicate((UseSSE>=1) && (UseAVX == 0));
1302   match(Set dst (MulF dst (LoadF src)));
1303 
1304   format %{ "mulss   $dst, $src" %}
1305   ins_cost(150);
1306   ins_encode %{
1307     __ mulss($dst$$XMMRegister, $src$$Address);
1308   %}
1309   ins_pipe(pipe_slow);
1310 %}
1311 
1312 instruct mulF_imm(regF dst, immF con) %{
1313   predicate((UseSSE>=1) && (UseAVX == 0));
1314   match(Set dst (MulF dst con));
1315   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1316   ins_cost(150);
1317   ins_encode %{
1318     __ mulss($dst$$XMMRegister, $constantaddress($con));
1319   %}
1320   ins_pipe(pipe_slow);
1321 %}
1322 
1323 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
1324   predicate(UseAVX > 0);
1325   match(Set dst (MulF src1 src2));
1326 
1327   format %{ "vmulss  $dst, $src1, $src2" %}
1328   ins_cost(150);
1329   ins_encode %{
1330     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1331   %}
1332   ins_pipe(pipe_slow);
1333 %}
1334 
1335 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
1336   predicate(UseAVX > 0);
1337   match(Set dst (MulF src1 (LoadF src2)));
1338 
1339   format %{ "vmulss  $dst, $src1, $src2" %}
1340   ins_cost(150);
1341   ins_encode %{
1342     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1343   %}
1344   ins_pipe(pipe_slow);
1345 %}
1346 
1347 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
1348   predicate(UseAVX > 0);
1349   match(Set dst (MulF src con));
1350 
1351   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1352   ins_cost(150);
1353   ins_encode %{
1354     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1355   %}
1356   ins_pipe(pipe_slow);
1357 %}
1358 
1359 instruct mulD_reg(regD dst, regD src) %{
1360   predicate((UseSSE>=2) && (UseAVX == 0));
1361   match(Set dst (MulD dst src));
1362 
1363   format %{ "mulsd   $dst, $src" %}
1364   ins_cost(150);
1365   ins_encode %{
1366     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
1367   %}
1368   ins_pipe(pipe_slow);
1369 %}
1370 
1371 instruct mulD_mem(regD dst, memory src) %{
1372   predicate((UseSSE>=2) && (UseAVX == 0));
1373   match(Set dst (MulD dst (LoadD src)));
1374 
1375   format %{ "mulsd   $dst, $src" %}
1376   ins_cost(150);
1377   ins_encode %{
1378     __ mulsd($dst$$XMMRegister, $src$$Address);
1379   %}
1380   ins_pipe(pipe_slow);
1381 %}
1382 
1383 instruct mulD_imm(regD dst, immD con) %{
1384   predicate((UseSSE>=2) && (UseAVX == 0));
1385   match(Set dst (MulD dst con));
1386   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1387   ins_cost(150);
1388   ins_encode %{
1389     __ mulsd($dst$$XMMRegister, $constantaddress($con));
1390   %}
1391   ins_pipe(pipe_slow);
1392 %}
1393 
1394 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
1395   predicate(UseAVX > 0);
1396   match(Set dst (MulD src1 src2));
1397 
1398   format %{ "vmulsd  $dst, $src1, $src2" %}
1399   ins_cost(150);
1400   ins_encode %{
1401     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1402   %}
1403   ins_pipe(pipe_slow);
1404 %}
1405 
1406 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
1407   predicate(UseAVX > 0);
1408   match(Set dst (MulD src1 (LoadD src2)));
1409 
1410   format %{ "vmulsd  $dst, $src1, $src2" %}
1411   ins_cost(150);
1412   ins_encode %{
1413     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1414   %}
1415   ins_pipe(pipe_slow);
1416 %}
1417 
1418 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
1419   predicate(UseAVX > 0);
1420   match(Set dst (MulD src con));
1421 
1422   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1423   ins_cost(150);
1424   ins_encode %{
1425     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1426   %}
1427   ins_pipe(pipe_slow);
1428 %}
1429 
1430 instruct divF_reg(regF dst, regF src) %{
1431   predicate((UseSSE>=1) && (UseAVX == 0));
1432   match(Set dst (DivF dst src));
1433 
1434   format %{ "divss   $dst, $src" %}
1435   ins_cost(150);
1436   ins_encode %{
1437     __ divss($dst$$XMMRegister, $src$$XMMRegister);
1438   %}
1439   ins_pipe(pipe_slow);
1440 %}
1441 
1442 instruct divF_mem(regF dst, memory src) %{
1443   predicate((UseSSE>=1) && (UseAVX == 0));
1444   match(Set dst (DivF dst (LoadF src)));
1445 
1446   format %{ "divss   $dst, $src" %}
1447   ins_cost(150);
1448   ins_encode %{
1449     __ divss($dst$$XMMRegister, $src$$Address);
1450   %}
1451   ins_pipe(pipe_slow);
1452 %}
1453 
1454 instruct divF_imm(regF dst, immF con) %{
1455   predicate((UseSSE>=1) && (UseAVX == 0));
1456   match(Set dst (DivF dst con));
1457   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1458   ins_cost(150);
1459   ins_encode %{
1460     __ divss($dst$$XMMRegister, $constantaddress($con));
1461   %}
1462   ins_pipe(pipe_slow);
1463 %}
1464 
1465 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
1466   predicate(UseAVX > 0);
1467   match(Set dst (DivF src1 src2));
1468 
1469   format %{ "vdivss  $dst, $src1, $src2" %}
1470   ins_cost(150);
1471   ins_encode %{
1472     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1473   %}
1474   ins_pipe(pipe_slow);
1475 %}
1476 
1477 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
1478   predicate(UseAVX > 0);
1479   match(Set dst (DivF src1 (LoadF src2)));
1480 
1481   format %{ "vdivss  $dst, $src1, $src2" %}
1482   ins_cost(150);
1483   ins_encode %{
1484     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1485   %}
1486   ins_pipe(pipe_slow);
1487 %}
1488 
1489 instruct divF_reg_imm(regF dst, regF src, immF con) %{
1490   predicate(UseAVX > 0);
1491   match(Set dst (DivF src con));
1492 
1493   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1494   ins_cost(150);
1495   ins_encode %{
1496     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1497   %}
1498   ins_pipe(pipe_slow);
1499 %}
1500 
1501 instruct divD_reg(regD dst, regD src) %{
1502   predicate((UseSSE>=2) && (UseAVX == 0));
1503   match(Set dst (DivD dst src));
1504 
1505   format %{ "divsd   $dst, $src" %}
1506   ins_cost(150);
1507   ins_encode %{
1508     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
1509   %}
1510   ins_pipe(pipe_slow);
1511 %}
1512 
1513 instruct divD_mem(regD dst, memory src) %{
1514   predicate((UseSSE>=2) && (UseAVX == 0));
1515   match(Set dst (DivD dst (LoadD src)));
1516 
1517   format %{ "divsd   $dst, $src" %}
1518   ins_cost(150);
1519   ins_encode %{
1520     __ divsd($dst$$XMMRegister, $src$$Address);
1521   %}
1522   ins_pipe(pipe_slow);
1523 %}
1524 
1525 instruct divD_imm(regD dst, immD con) %{
1526   predicate((UseSSE>=2) && (UseAVX == 0));
1527   match(Set dst (DivD dst con));
1528   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1529   ins_cost(150);
1530   ins_encode %{
1531     __ divsd($dst$$XMMRegister, $constantaddress($con));
1532   %}
1533   ins_pipe(pipe_slow);
1534 %}
1535 
1536 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
1537   predicate(UseAVX > 0);
1538   match(Set dst (DivD src1 src2));
1539 
1540   format %{ "vdivsd  $dst, $src1, $src2" %}
1541   ins_cost(150);
1542   ins_encode %{
1543     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1544   %}
1545   ins_pipe(pipe_slow);
1546 %}
1547 
1548 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
1549   predicate(UseAVX > 0);
1550   match(Set dst (DivD src1 (LoadD src2)));
1551 
1552   format %{ "vdivsd  $dst, $src1, $src2" %}
1553   ins_cost(150);
1554   ins_encode %{
1555     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1556   %}
1557   ins_pipe(pipe_slow);
1558 %}
1559 
1560 instruct divD_reg_imm(regD dst, regD src, immD con) %{
1561   predicate(UseAVX > 0);
1562   match(Set dst (DivD src con));
1563 
1564   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1565   ins_cost(150);
1566   ins_encode %{
1567     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1568   %}
1569   ins_pipe(pipe_slow);
1570 %}
1571 
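     // Absolute value and negation are implemented with bitwise masks from the
     // constant area: and'ing with 0x7fffffff / 0x7fffffffffffffff clears the sign
     // bit, xor'ing with 0x80000000 / 0x8000000000000000 flips it.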
1572 instruct absF_reg(regF dst) %{
1573   predicate((UseSSE>=1) && (UseAVX == 0));
1574   match(Set dst (AbsF dst));
1575   ins_cost(150);
1576   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
1577   ins_encode %{
1578     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
1579   %}
1580   ins_pipe(pipe_slow);
1581 %}
1582 
1583 instruct absF_reg_reg(regF dst, regF src) %{
1584   predicate(UseAVX > 0);
1585   match(Set dst (AbsF src));
1586   ins_cost(150);
1587   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
1588   ins_encode %{
1589     bool vector256 = false;
1590     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
1591               ExternalAddress(float_signmask()), vector256);
1592   %}
1593   ins_pipe(pipe_slow);
1594 %}
1595 
1596 instruct absD_reg(regD dst) %{
1597   predicate((UseSSE>=2) && (UseAVX == 0));
1598   match(Set dst (AbsD dst));
1599   ins_cost(150);
1600   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
1601             "# abs double by sign masking" %}
1602   ins_encode %{
1603     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
1604   %}
1605   ins_pipe(pipe_slow);
1606 %}
1607 
1608 instruct absD_reg_reg(regD dst, regD src) %{
1609   predicate(UseAVX > 0);
1610   match(Set dst (AbsD src));
1611   ins_cost(150);
1612   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
1613             "# abs double by sign masking" %}
1614   ins_encode %{
1615     bool vector256 = false;
1616     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
1617               ExternalAddress(double_signmask()), vector256);
1618   %}
1619   ins_pipe(pipe_slow);
1620 %}
1621 
1622 instruct negF_reg(regF dst) %{
1623   predicate((UseSSE>=1) && (UseAVX == 0));
1624   match(Set dst (NegF dst));
1625   ins_cost(150);
1626   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
1627   ins_encode %{
1628     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
1629   %}
1630   ins_pipe(pipe_slow);
1631 %}
1632 
1633 instruct negF_reg_reg(regF dst, regF src) %{
1634   predicate(UseAVX > 0);
1635   match(Set dst (NegF src));
1636   ins_cost(150);
1637   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
1638   ins_encode %{
1639     bool vector256 = false;
1640     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
1641               ExternalAddress(float_signflip()), vector256);
1642   %}
1643   ins_pipe(pipe_slow);
1644 %}
1645 
1646 instruct negD_reg(regD dst) %{
1647   predicate((UseSSE>=2) && (UseAVX == 0));
1648   match(Set dst (NegD dst));
1649   ins_cost(150);
1650   format %{ "xorpd   $dst, [0x8000000000000000]\t"
1651             "# neg double by sign flipping" %}
1652   ins_encode %{
1653     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
1654   %}
1655   ins_pipe(pipe_slow);
1656 %}
1657 
1658 instruct negD_reg_reg(regD dst, regD src) %{
1659   predicate(UseAVX > 0);
1660   match(Set dst (NegD src));
1661   ins_cost(150);
1662   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
1663             "# neg double by sign flipping" %}
1664   ins_encode %{
1665     bool vector256 = false;
1666     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
1667               ExternalAddress(double_signflip()), vector256);
1668   %}
1669   ins_pipe(pipe_slow);
1670 %}
1671 
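     // Java exposes only a double-precision sqrt, so a float square root such as
     // (float)Math.sqrt(f) appears in the ideal graph as ConvF2D -> SqrtD -> ConvD2F.
     // The patterns below match the whole chain and emit a single sqrtss, which
     // produces the same correctly rounded float result.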
1672 instruct sqrtF_reg(regF dst, regF src) %{
1673   predicate(UseSSE>=1);
1674   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
1675 
1676   format %{ "sqrtss  $dst, $src" %}
1677   ins_cost(150);
1678   ins_encode %{
1679     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
1680   %}
1681   ins_pipe(pipe_slow);
1682 %}
1683 
1684 instruct sqrtF_mem(regF dst, memory src) %{
1685   predicate(UseSSE>=1);
1686   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
1687 
1688   format %{ "sqrtss  $dst, $src" %}
1689   ins_cost(150);
1690   ins_encode %{
1691     __ sqrtss($dst$$XMMRegister, $src$$Address);
1692   %}
1693   ins_pipe(pipe_slow);
1694 %}
1695 
1696 instruct sqrtF_imm(regF dst, immF con) %{
1697   predicate(UseSSE>=1);
1698   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
1699   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1700   ins_cost(150);
1701   ins_encode %{
1702     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
1703   %}
1704   ins_pipe(pipe_slow);
1705 %}
1706 
1707 instruct sqrtD_reg(regD dst, regD src) %{
1708   predicate(UseSSE>=2);
1709   match(Set dst (SqrtD src));
1710 
1711   format %{ "sqrtsd  $dst, $src" %}
1712   ins_cost(150);
1713   ins_encode %{
1714     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
1715   %}
1716   ins_pipe(pipe_slow);
1717 %}
1718 
1719 instruct sqrtD_mem(regD dst, memory src) %{
1720   predicate(UseSSE>=2);
1721   match(Set dst (SqrtD (LoadD src)));
1722 
1723   format %{ "sqrtsd  $dst, $src" %}
1724   ins_cost(150);
1725   ins_encode %{
1726     __ sqrtsd($dst$$XMMRegister, $src$$Address);
1727   %}
1728   ins_pipe(pipe_slow);
1729 %}
1730 
1731 instruct sqrtD_imm(regD dst, immD con) %{
1732   predicate(UseSSE>=2);
1733   match(Set dst (SqrtD con));
1734   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1735   ins_cost(150);
1736   ins_encode %{
1737     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
1738   %}
1739   ins_pipe(pipe_slow);
1740 %}
1741 
1742 
1743 // ====================VECTOR INSTRUCTIONS=====================================
1744 
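     // The vecS/vecD/vecX/vecY operand classes correspond to 4-, 8-, 16- and
     // 32-byte vectors; vecX maps to a full XMM register and vecY to a YMM
     // register (AVX).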
1745 // Load vectors (4 bytes long)
1746 instruct loadV4(vecS dst, memory mem) %{
1747   predicate(n->as_LoadVector()->memory_size() == 4);
1748   match(Set dst (LoadVector mem));
1749   ins_cost(125);
1750   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
1751   ins_encode %{
1752     __ movdl($dst$$XMMRegister, $mem$$Address);
1753   %}
1754   ins_pipe( pipe_slow );
1755 %}
1756 
1757 // Load vectors (8 bytes long)
1758 instruct loadV8(vecD dst, memory mem) %{
1759   predicate(n->as_LoadVector()->memory_size() == 8);
1760   match(Set dst (LoadVector mem));
1761   ins_cost(125);
1762   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
1763   ins_encode %{
1764     __ movq($dst$$XMMRegister, $mem$$Address);
1765   %}
1766   ins_pipe( pipe_slow );
1767 %}
1768 
1769 // Load vectors (16 bytes long)
1770 instruct loadV16(vecX dst, memory mem) %{
1771   predicate(n->as_LoadVector()->memory_size() == 16);
1772   match(Set dst (LoadVector mem));
1773   ins_cost(125);
1774   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
1775   ins_encode %{
1776     __ movdqu($dst$$XMMRegister, $mem$$Address);
1777   %}
1778   ins_pipe( pipe_slow );
1779 %}
1780 
1781 // Load vectors (32 bytes long)
1782 instruct loadV32(vecY dst, memory mem) %{
1783   predicate(n->as_LoadVector()->memory_size() == 32);
1784   match(Set dst (LoadVector mem));
1785   ins_cost(125);
1786   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
1787   ins_encode %{
1788     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
1789   %}
1790   ins_pipe( pipe_slow );
1791 %}
1792 
1793 // Store vectors
1794 instruct storeV4(memory mem, vecS src) %{
1795   predicate(n->as_StoreVector()->memory_size() == 4);
1796   match(Set mem (StoreVector mem src));
1797   ins_cost(145);
1798   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
1799   ins_encode %{
1800     __ movdl($mem$$Address, $src$$XMMRegister);
1801   %}
1802   ins_pipe( pipe_slow );
1803 %}
1804 
1805 instruct storeV8(memory mem, vecD src) %{
1806   predicate(n->as_StoreVector()->memory_size() == 8);
1807   match(Set mem (StoreVector mem src));
1808   ins_cost(145);
1809   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
1810   ins_encode %{
1811     __ movq($mem$$Address, $src$$XMMRegister);
1812   %}
1813   ins_pipe( pipe_slow );
1814 %}
1815 
1816 instruct storeV16(memory mem, vecX src) %{
1817   predicate(n->as_StoreVector()->memory_size() == 16);
1818   match(Set mem (StoreVector mem src));
1819   ins_cost(145);
1820   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
1821   ins_encode %{
1822     __ movdqu($mem$$Address, $src$$XMMRegister);
1823   %}
1824   ins_pipe( pipe_slow );
1825 %}
1826 
1827 instruct storeV32(memory mem, vecY src) %{
1828   predicate(n->as_StoreVector()->memory_size() == 32);
1829   match(Set mem (StoreVector mem src));
1830   ins_cost(145);
1831   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
1832   ins_encode %{
1833     __ vmovdqu($mem$$Address, $src$$XMMRegister);
1834   %}
1835   ins_pipe( pipe_slow );
1836 %}
1837 
1838 // Replicate byte scalar to be vector
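     // The replicate patterns broadcast a GPR value: movd moves it into the low
     // lanes of an XMM register, punpcklbw and pshuflw smear the byte through the
     // low 64 bits, punpcklqdq duplicates that quadword across 128 bits, and
     // vinserti128h copies the low 128-bit lane into the upper half of the YMM
     // register for 256-bit vectors.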
1839 instruct Repl4B(vecS dst, rRegI src) %{
1840   predicate(n->as_Vector()->length() == 4);
1841   match(Set dst (ReplicateB src));
1842   format %{ "movd    $dst,$src\n\t"
1843             "punpcklbw $dst,$dst\n\t"
1844             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
1845   ins_encode %{
1846     __ movdl($dst$$XMMRegister, $src$$Register);
1847     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1848     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1849   %}
1850   ins_pipe( pipe_slow );
1851 %}
1852 
1853 instruct Repl8B(vecD dst, rRegI src) %{
1854   predicate(n->as_Vector()->length() == 8);
1855   match(Set dst (ReplicateB src));
1856   format %{ "movd    $dst,$src\n\t"
1857             "punpcklbw $dst,$dst\n\t"
1858             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
1859   ins_encode %{
1860     __ movdl($dst$$XMMRegister, $src$$Register);
1861     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1862     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1863   %}
1864   ins_pipe( pipe_slow );
1865 %}
1866 
1867 instruct Repl16B(vecX dst, rRegI src) %{
1868   predicate(n->as_Vector()->length() == 16);
1869   match(Set dst (ReplicateB src));
1870   format %{ "movd    $dst,$src\n\t"
1871             "punpcklbw $dst,$dst\n\t"
1872             "pshuflw $dst,$dst,0x00\n\t"
1873             "punpcklqdq $dst,$dst\t! replicate16B" %}
1874   ins_encode %{
1875     __ movdl($dst$$XMMRegister, $src$$Register);
1876     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1877     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1878     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1879   %}
1880   ins_pipe( pipe_slow );
1881 %}
1882 
1883 instruct Repl32B(vecY dst, rRegI src) %{
1884   predicate(n->as_Vector()->length() == 32);
1885   match(Set dst (ReplicateB src));
1886   format %{ "movd    $dst,$src\n\t"
1887             "punpcklbw $dst,$dst\n\t"
1888             "pshuflw $dst,$dst,0x00\n\t"
1889             "punpcklqdq $dst,$dst\n\t"
1890             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
1891   ins_encode %{
1892     __ movdl($dst$$XMMRegister, $src$$Register);
1893     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1894     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1895     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1896     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1897   %}
1898   ins_pipe( pipe_slow );
1899 %}
1900 
1901 // Replicate byte scalar immediate to be vector by loading from const table.
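     // The replicate4_imm/replicate8_imm helpers pack the immediate, repeated at
     // the given element width, into a 32- or 64-bit constant, so a single
     // movdl/movq from the constant table materializes the broadcast value.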
1902 instruct Repl4B_imm(vecS dst, immI con) %{
1903   predicate(n->as_Vector()->length() == 4);
1904   match(Set dst (ReplicateB con));
1905   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
1906   ins_encode %{
1907     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
1908   %}
1909   ins_pipe( pipe_slow );
1910 %}
1911 
1912 instruct Repl8B_imm(vecD dst, immI con) %{
1913   predicate(n->as_Vector()->length() == 8);
1914   match(Set dst (ReplicateB con));
1915   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
1916   ins_encode %{
1917     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1918   %}
1919   ins_pipe( pipe_slow );
1920 %}
1921 
1922 instruct Repl16B_imm(vecX dst, immI con) %{
1923   predicate(n->as_Vector()->length() == 16);
1924   match(Set dst (ReplicateB con));
1925   format %{ "movq    $dst,[$constantaddress]\n\t"
1926             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
1927   ins_encode %{
1928     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1929     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1930   %}
1931   ins_pipe( pipe_slow );
1932 %}
1933 
1934 instruct Repl32B_imm(vecY dst, immI con) %{
1935   predicate(n->as_Vector()->length() == 32);
1936   match(Set dst (ReplicateB con));
1937   format %{ "movq    $dst,[$constantaddress]\n\t"
1938             "punpcklqdq $dst,$dst\n\t"
1939             "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
1940   ins_encode %{
1941     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1942     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1943     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1944   %}
1945   ins_pipe( pipe_slow );
1946 %}
1947 
1948 // Replicate byte scalar zero to be vector
1949 instruct Repl4B_zero(vecS dst, immI0 zero) %{
1950   predicate(n->as_Vector()->length() == 4);
1951   match(Set dst (ReplicateB zero));
1952   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
1953   ins_encode %{
1954     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1955   %}
1956   ins_pipe( fpu_reg_reg );
1957 %}
1958 
1959 instruct Repl8B_zero(vecD dst, immI0 zero) %{
1960   predicate(n->as_Vector()->length() == 8);
1961   match(Set dst (ReplicateB zero));
1962   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
1963   ins_encode %{
1964     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1965   %}
1966   ins_pipe( fpu_reg_reg );
1967 %}
1968 
1969 instruct Repl16B_zero(vecX dst, immI0 zero) %{
1970   predicate(n->as_Vector()->length() == 16);
1971   match(Set dst (ReplicateB zero));
1972   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
1973   ins_encode %{
1974     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1975   %}
1976   ins_pipe( fpu_reg_reg );
1977 %}
1978 
1979 instruct Repl32B_zero(vecY dst, immI0 zero) %{
1980   predicate(n->as_Vector()->length() == 32);
1981   match(Set dst (ReplicateB zero));
1982   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
1983   ins_encode %{
1984     // 256-bit vpxor requires AVX2 (plain AVX has only vxorps/vxorpd at 256 bits).
1985     bool vector256 = true;
1986     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
1987   %}
1988   ins_pipe( fpu_reg_reg );
1989 %}
1990 
1991 // Replicate char/short (2 byte) scalar to be vector
1992 instruct Repl2S(vecS dst, rRegI src) %{
1993   predicate(n->as_Vector()->length() == 2);
1994   match(Set dst (ReplicateS src));
1995   format %{ "movd    $dst,$src\n\t"
1996             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
1997   ins_encode %{
1998     __ movdl($dst$$XMMRegister, $src$$Register);
1999     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2000   %}
2001   ins_pipe( fpu_reg_reg );
2002 %}
2003 
2004 instruct Repl4S(vecD dst, rRegI src) %{
2005   predicate(n->as_Vector()->length() == 4);
2006   match(Set dst (ReplicateS src));
2007   format %{ "movd    $dst,$src\n\t"
2008             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
2009   ins_encode %{
2010     __ movdl($dst$$XMMRegister, $src$$Register);
2011     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2012   %}
2013   ins_pipe( fpu_reg_reg );
2014 %}
2015 
2016 instruct Repl8S(vecX dst, rRegI src) %{
2017   predicate(n->as_Vector()->length() == 8);
2018   match(Set dst (ReplicateS src));
2019   format %{ "movd    $dst,$src\n\t"
2020             "pshuflw $dst,$dst,0x00\n\t"
2021             "punpcklqdq $dst,$dst\t! replicate8S" %}
2022   ins_encode %{
2023     __ movdl($dst$$XMMRegister, $src$$Register);
2024     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2025     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2026   %}
2027   ins_pipe( pipe_slow );
2028 %}
2029 
2030 instruct Repl16S(vecY dst, rRegI src) %{
2031   predicate(n->as_Vector()->length() == 16);
2032   match(Set dst (ReplicateS src));
2033   format %{ "movd    $dst,$src\n\t"
2034             "pshuflw $dst,$dst,0x00\n\t"
2035             "punpcklqdq $dst,$dst\n\t"
2036             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
2037   ins_encode %{
2038     __ movdl($dst$$XMMRegister, $src$$Register);
2039     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2040     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2041     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2042   %}
2043   ins_pipe( pipe_slow );
2044 %}
2045 
2046 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
2047 instruct Repl2S_imm(vecS dst, immI con) %{
2048   predicate(n->as_Vector()->length() == 2);
2049   match(Set dst (ReplicateS con));
2050   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
2051   ins_encode %{
2052     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
2053   %}
2054   ins_pipe( fpu_reg_reg );
2055 %}
2056 
2057 instruct Repl4S_imm(vecD dst, immI con) %{
2058   predicate(n->as_Vector()->length() == 4);
2059   match(Set dst (ReplicateS con));
2060   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
2061   ins_encode %{
2062     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2063   %}
2064   ins_pipe( fpu_reg_reg );
2065 %}
2066 
2067 instruct Repl8S_imm(vecX dst, immI con) %{
2068   predicate(n->as_Vector()->length() == 8);
2069   match(Set dst (ReplicateS con));
2070   format %{ "movq    $dst,[$constantaddress]\n\t"
2071             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
2072   ins_encode %{
2073     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2074     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2075   %}
2076   ins_pipe( pipe_slow );
2077 %}
2078 
2079 instruct Repl16S_imm(vecY dst, immI con) %{
2080   predicate(n->as_Vector()->length() == 16);
2081   match(Set dst (ReplicateS con));
2082   format %{ "movq    $dst,[$constantaddress]\n\t"
2083             "punpcklqdq $dst,$dst\n\t"
2084             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
2085   ins_encode %{
2086     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2087     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2088     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2089   %}
2090   ins_pipe( pipe_slow );
2091 %}
2092 
2093 // Replicate char/short (2 byte) scalar zero to be vector
2094 instruct Repl2S_zero(vecS dst, immI0 zero) %{
2095   predicate(n->as_Vector()->length() == 2);
2096   match(Set dst (ReplicateS zero));
2097   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
2098   ins_encode %{
2099     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2100   %}
2101   ins_pipe( fpu_reg_reg );
2102 %}
2103 
2104 instruct Repl4S_zero(vecD dst, immI0 zero) %{
2105   predicate(n->as_Vector()->length() == 4);
2106   match(Set dst (ReplicateS zero));
2107   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
2108   ins_encode %{
2109     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2110   %}
2111   ins_pipe( fpu_reg_reg );
2112 %}
2113 
2114 instruct Repl8S_zero(vecX dst, immI0 zero) %{
2115   predicate(n->as_Vector()->length() == 8);
2116   match(Set dst (ReplicateS zero));
2117   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
2118   ins_encode %{
2119     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2120   %}
2121   ins_pipe( fpu_reg_reg );
2122 %}
2123 
2124 instruct Repl16S_zero(vecY dst, immI0 zero) %{
2125   predicate(n->as_Vector()->length() == 16);
2126   match(Set dst (ReplicateS zero));
2127   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
2128   ins_encode %{
2129     // 256-bit vpxor requires AVX2 (plain AVX has only vxorps/vxorpd at 256 bits).
2130     bool vector256 = true;
2131     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2132   %}
2133   ins_pipe( fpu_reg_reg );
2134 %}
2135 
2136 // Replicate integer (4 byte) scalar to be vector
2137 instruct Repl2I(vecD dst, rRegI src) %{
2138   predicate(n->as_Vector()->length() == 2);
2139   match(Set dst (ReplicateI src));
2140   format %{ "movd    $dst,$src\n\t"
2141             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2142   ins_encode %{
2143     __ movdl($dst$$XMMRegister, $src$$Register);
2144     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2145   %}
2146   ins_pipe( fpu_reg_reg );
2147 %}
2148 
2149 instruct Repl4I(vecX dst, rRegI src) %{
2150   predicate(n->as_Vector()->length() == 4);
2151   match(Set dst (ReplicateI src));
2152   format %{ "movd    $dst,$src\n\t"
2153             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2154   ins_encode %{
2155     __ movdl($dst$$XMMRegister, $src$$Register);
2156     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2157   %}
2158   ins_pipe( pipe_slow );
2159 %}
2160 
2161 instruct Repl8I(vecY dst, rRegI src) %{
2162   predicate(n->as_Vector()->length() == 8);
2163   match(Set dst (ReplicateI src));
2164   format %{ "movd    $dst,$src\n\t"
2165             "pshufd  $dst,$dst,0x00\n\t"
2166             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2167   ins_encode %{
2168     __ movdl($dst$$XMMRegister, $src$$Register);
2169     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2170     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2171   %}
2172   ins_pipe( pipe_slow );
2173 %}
2174 
2175 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
2176 instruct Repl2I_imm(vecD dst, immI con) %{
2177   predicate(n->as_Vector()->length() == 2);
2178   match(Set dst (ReplicateI con));
2179   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
2180   ins_encode %{
2181     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2182   %}
2183   ins_pipe( fpu_reg_reg );
2184 %}
2185 
2186 instruct Repl4I_imm(vecX dst, immI con) %{
2187   predicate(n->as_Vector()->length() == 4);
2188   match(Set dst (ReplicateI con));
2189   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
2190             "punpcklqdq $dst,$dst" %}
2191   ins_encode %{
2192     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2193     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2194   %}
2195   ins_pipe( pipe_slow );
2196 %}
2197 
2198 instruct Repl8I_imm(vecY dst, immI con) %{
2199   predicate(n->as_Vector()->length() == 8);
2200   match(Set dst (ReplicateI con));
2201   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
2202             "punpcklqdq $dst,$dst\n\t"
2203             "vinserti128h $dst,$dst,$dst" %}
2204   ins_encode %{
2205     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2206     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2207     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2208   %}
2209   ins_pipe( pipe_slow );
2210 %}
2211 
2212 // An integer can be loaded into an XMM register directly from memory.
2213 instruct Repl2I_mem(vecD dst, memory mem) %{
2214   predicate(n->as_Vector()->length() == 2);
2215   match(Set dst (ReplicateI (LoadI mem)));
2216   format %{ "movd    $dst,$mem\n\t"
2217             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2218   ins_encode %{
2219     __ movdl($dst$$XMMRegister, $mem$$Address);
2220     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2221   %}
2222   ins_pipe( fpu_reg_reg );
2223 %}
2224 
2225 instruct Repl4I_mem(vecX dst, memory mem) %{
2226   predicate(n->as_Vector()->length() == 4);
2227   match(Set dst (ReplicateI (LoadI mem)));
2228   format %{ "movd    $dst,$mem\n\t"
2229             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2230   ins_encode %{
2231     __ movdl($dst$$XMMRegister, $mem$$Address);
2232     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2233   %}
2234   ins_pipe( pipe_slow );
2235 %}
2236 
2237 instruct Repl8I_mem(vecY dst, memory mem) %{
2238   predicate(n->as_Vector()->length() == 8);
2239   match(Set dst (ReplicateI (LoadI mem)));
2240   format %{ "movd    $dst,$mem\n\t"
2241             "pshufd  $dst,$dst,0x00\n\t"
2242             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2243   ins_encode %{
2244     __ movdl($dst$$XMMRegister, $mem$$Address);
2245     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2246     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2247   %}
2248   ins_pipe( pipe_slow );
2249 %}
2250 
2251 // Replicate integer (4 byte) scalar zero to be vector
2252 instruct Repl2I_zero(vecD dst, immI0 zero) %{
2253   predicate(n->as_Vector()->length() == 2);
2254   match(Set dst (ReplicateI zero));
2255   format %{ "pxor    $dst,$dst\t! replicate2I" %}
2256   ins_encode %{
2257     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2258   %}
2259   ins_pipe( fpu_reg_reg );
2260 %}
2261 
2262 instruct Repl4I_zero(vecX dst, immI0 zero) %{
2263   predicate(n->as_Vector()->length() == 4);
2264   match(Set dst (ReplicateI zero));
2265   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
2266   ins_encode %{
2267     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2268   %}
2269   ins_pipe( fpu_reg_reg );
2270 %}
2271 
2272 instruct Repl8I_zero(vecY dst, immI0 zero) %{
2273   predicate(n->as_Vector()->length() == 8);
2274   match(Set dst (ReplicateI zero));
2275   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
2276   ins_encode %{
2277     // 256-bit vpxor requires AVX2 (plain AVX has only vxorps/vxorpd at 256 bits).
2278     bool vector256 = true;
2279     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2280   %}
2281   ins_pipe( fpu_reg_reg );
2282 %}
2283 
2284 // Replicate long (8 byte) scalar to be vector
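     // On 64-bit a long fits in one GPR and is moved with movdq; on 32-bit the
     // value lives in a register pair, so the low and high halves are moved
     // separately and merged with punpckldq before the quadword is duplicated.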
2285 #ifdef _LP64
2286 instruct Repl2L(vecX dst, rRegL src) %{
2287   predicate(n->as_Vector()->length() == 2);
2288   match(Set dst (ReplicateL src));
2289   format %{ "movdq   $dst,$src\n\t"
2290             "punpcklqdq $dst,$dst\t! replicate2L" %}
2291   ins_encode %{
2292     __ movdq($dst$$XMMRegister, $src$$Register);
2293     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2294   %}
2295   ins_pipe( pipe_slow );
2296 %}
2297 
2298 instruct Repl4L(vecY dst, rRegL src) %{
2299   predicate(n->as_Vector()->length() == 4);
2300   match(Set dst (ReplicateL src));
2301   format %{ "movdq   $dst,$src\n\t"
2302             "punpcklqdq $dst,$dst\n\t"
2303             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2304   ins_encode %{
2305     __ movdq($dst$$XMMRegister, $src$$Register);
2306     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2307     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2308   %}
2309   ins_pipe( pipe_slow );
2310 %}
2311 #else // _LP64
2312 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
2313   predicate(n->as_Vector()->length() == 2);
2314   match(Set dst (ReplicateL src));
2315   effect(TEMP dst, USE src, TEMP tmp);
2316   format %{ "movdl   $dst,$src.lo\n\t"
2317             "movdl   $tmp,$src.hi\n\t"
2318             "punpckldq $dst,$tmp\n\t"
2319             "punpcklqdq $dst,$dst\t! replicate2L"%}
2320   ins_encode %{
2321     __ movdl($dst$$XMMRegister, $src$$Register);
2322     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2323     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2324     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2325   %}
2326   ins_pipe( pipe_slow );
2327 %}
2328 
2329 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
2330   predicate(n->as_Vector()->length() == 4);
2331   match(Set dst (ReplicateL src));
2332   effect(TEMP dst, USE src, TEMP tmp);
2333   format %{ "movdl   $dst,$src.lo\n\t"
2334             "movdl   $tmp,$src.hi\n\t"
2335             "punpckldq $dst,$tmp\n\t"
2336             "punpcklqdq $dst,$dst\n\t"
2337             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2338   ins_encode %{
2339     __ movdl($dst$$XMMRegister, $src$$Register);
2340     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2341     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2342     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2343     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2344   %}
2345   ins_pipe( pipe_slow );
2346 %}
2347 #endif // _LP64
2348 
2349 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
2350 instruct Repl2L_imm(vecX dst, immL con) %{
2351   predicate(n->as_Vector()->length() == 2);
2352   match(Set dst (ReplicateL con));
2353   format %{ "movq    $dst,[$constantaddress]\n\t"
2354             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
2355   ins_encode %{
2356     __ movq($dst$$XMMRegister, $constantaddress($con));
2357     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2358   %}
2359   ins_pipe( pipe_slow );
2360 %}
2361 
2362 instruct Repl4L_imm(vecY dst, immL con) %{
2363   predicate(n->as_Vector()->length() == 4);
2364   match(Set dst (ReplicateL con));
2365   format %{ "movq    $dst,[$constantaddress]\n\t"
2366             "punpcklqdq $dst,$dst\n\t"
2367             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
2368   ins_encode %{
2369     __ movq($dst$$XMMRegister, $constantaddress($con));
2370     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2371     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2372   %}
2373   ins_pipe( pipe_slow );
2374 %}
2375 
2376 // A long can be loaded into an XMM register directly from memory.
2377 instruct Repl2L_mem(vecX dst, memory mem) %{
2378   predicate(n->as_Vector()->length() == 2);
2379   match(Set dst (ReplicateL (LoadL mem)));
2380   format %{ "movq    $dst,$mem\n\t"
2381             "punpcklqdq $dst,$dst\t! replicate2L" %}
2382   ins_encode %{
2383     __ movq($dst$$XMMRegister, $mem$$Address);
2384     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2385   %}
2386   ins_pipe( pipe_slow );
2387 %}
2388 
2389 instruct Repl4L_mem(vecY dst, memory mem) %{
2390   predicate(n->as_Vector()->length() == 4);
2391   match(Set dst (ReplicateL (LoadL mem)));
2392   format %{ "movq    $dst,$mem\n\t"
2393             "punpcklqdq $dst,$dst\n\t"
2394             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2395   ins_encode %{
2396     __ movq($dst$$XMMRegister, $mem$$Address);
2397     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2398     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2399   %}
2400   ins_pipe( pipe_slow );
2401 %}
2402 
2403 // Replicate long (8 byte) scalar zero to be vector
2404 instruct Repl2L_zero(vecX dst, immL0 zero) %{
2405   predicate(n->as_Vector()->length() == 2);
2406   match(Set dst (ReplicateL zero));
2407   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
2408   ins_encode %{
2409     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2410   %}
2411   ins_pipe( fpu_reg_reg );
2412 %}
2413 
2414 instruct Repl4L_zero(vecY dst, immL0 zero) %{
2415   predicate(n->as_Vector()->length() == 4);
2416   match(Set dst (ReplicateL zero));
2417   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
2418   ins_encode %{
2419     // 256-bit vpxor requires AVX2 (plain AVX has only vxorps/vxorpd at 256 bits).
2420     bool vector256 = true;
2421     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2422   %}
2423   ins_pipe( fpu_reg_reg );
2424 %}
2425 
2426 // Replicate float (4 byte) scalar to be vector
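     // Float and double scalars already live in XMM registers, so a single pshufd
     // shuffle broadcasts them directly (0x00 for floats, 0x44 for doubles).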
2427 instruct Repl2F(vecD dst, regF src) %{
2428   predicate(n->as_Vector()->length() == 2);
2429   match(Set dst (ReplicateF src));
2430   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
2431   ins_encode %{
2432     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2433   %}
2434   ins_pipe( fpu_reg_reg );
2435 %}
2436 
2437 instruct Repl4F(vecX dst, regF src) %{
2438   predicate(n->as_Vector()->length() == 4);
2439   match(Set dst (ReplicateF src));
2440   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
2441   ins_encode %{
2442     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2443   %}
2444   ins_pipe( pipe_slow );
2445 %}
2446 
2447 instruct Repl8F(vecY dst, regF src) %{
2448   predicate(n->as_Vector()->length() == 8);
2449   match(Set dst (ReplicateF src));
2450   format %{ "pshufd  $dst,$src,0x00\n\t"
2451             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
2452   ins_encode %{
2453     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2454     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2455   %}
2456   ins_pipe( pipe_slow );
2457 %}
2458 
2459 // Replicate float (4 byte) scalar zero to be vector
2460 instruct Repl2F_zero(vecD dst, immF0 zero) %{
2461   predicate(n->as_Vector()->length() == 2);
2462   match(Set dst (ReplicateF zero));
2463   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
2464   ins_encode %{
2465     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2466   %}
2467   ins_pipe( fpu_reg_reg );
2468 %}
2469 
2470 instruct Repl4F_zero(vecX dst, immF0 zero) %{
2471   predicate(n->as_Vector()->length() == 4);
2472   match(Set dst (ReplicateF zero));
2473   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
2474   ins_encode %{
2475     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2476   %}
2477   ins_pipe( fpu_reg_reg );
2478 %}
2479 
2480 instruct Repl8F_zero(vecY dst, immF0 zero) %{
2481   predicate(n->as_Vector()->length() == 8);
2482   match(Set dst (ReplicateF zero));
2483   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
2484   ins_encode %{
2485     bool vector256 = true;
2486     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2487   %}
2488   ins_pipe( fpu_reg_reg );
2489 %}
2490 
2491 // Replicate double (8 bytes) scalar to be vector
2492 instruct Repl2D(vecX dst, regD src) %{
2493   predicate(n->as_Vector()->length() == 2);
2494   match(Set dst (ReplicateD src));
2495   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
2496   ins_encode %{
2497     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2498   %}
2499   ins_pipe( pipe_slow );
2500 %}
2501 
2502 instruct Repl4D(vecY dst, regD src) %{
2503   predicate(n->as_Vector()->length() == 4);
2504   match(Set dst (ReplicateD src));
2505   format %{ "pshufd  $dst,$src,0x44\n\t"
2506             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
2507   ins_encode %{
2508     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2509     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2510   %}
2511   ins_pipe( pipe_slow );
2512 %}
2513 
2514 // Replicate double (8 byte) scalar zero to be vector
2515 instruct Repl2D_zero(vecX dst, immD0 zero) %{
2516   predicate(n->as_Vector()->length() == 2);
2517   match(Set dst (ReplicateD zero));
2518   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
2519   ins_encode %{
2520     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
2521   %}
2522   ins_pipe( fpu_reg_reg );
2523 %}
2524 
2525 instruct Repl4D_zero(vecY dst, immD0 zero) %{
2526   predicate(n->as_Vector()->length() == 4);
2527   match(Set dst (ReplicateD zero));
2528   format %{ "vxorpd  $dst,$dst,$dst\t! replicate4D zero" %}
2529   ins_encode %{
2530     bool vector256 = true;
2531     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2532   %}
2533   ins_pipe( fpu_reg_reg );
2534 %}
2535 
2536 // ====================REDUCTION ARITHMETIC=======================================
2537 
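     // Reduction nodes (AddReductionVI/VF/VD, MulReductionVI/VF, ...) combine the
     // scalar src1 with the sum or product of all elements of vector src2 and
     // produce a scalar result. The SSE variants use horizontal adds (phaddd) or
     // shuffle-and-accumulate sequences; the AVX variants for 256-bit vectors also
     // extract the upper 128-bit lane with vextractf128 before folding it in.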
2538 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2539   predicate(UseSSE > 2 && UseAVX == 0);
2540   match(Set dst (AddReductionVI src1 src2));
2541   effect(TEMP tmp2, TEMP tmp);
2542   format %{ "movdqu  $tmp2,$src2\n\t"
2543             "phaddd  $tmp2,$tmp2\n\t"
2544             "movd    $tmp,$src1\n\t"
2545             "paddd   $tmp,$tmp2\n\t"
2546             "movd    $dst,$tmp\t! add reduction2I" %}
2547   ins_encode %{
2548     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
2549     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
2550     __ movdl($tmp$$XMMRegister, $src1$$Register);
2551     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
2552     __ movdl($dst$$Register, $tmp$$XMMRegister);
2553   %}
2554   ins_pipe( pipe_slow );
2555 %}
2556 
2557 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2558   predicate(UseAVX > 0);
2559   match(Set dst (AddReductionVI src1 src2));
2560   effect(TEMP tmp, TEMP tmp2);
2561   format %{ "vphaddd $tmp,$src2,$src2\n\t"
2562             "movd    $tmp2,$src1\n\t"
2563             "vpaddd  $tmp2,$tmp2,$tmp\n\t"
2564             "movd    $dst,$tmp2\t! add reduction2I" %}
2565   ins_encode %{
2566     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
2567     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2568     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
2569     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2570   %}
2571   ins_pipe( pipe_slow );
2572 %}
2573 
2574 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2575   predicate(UseSSE > 2 && UseAVX == 0);
2576   match(Set dst (AddReductionVI src1 src2));
2577   effect(TEMP tmp2, TEMP tmp);
2578   format %{ "movdqu  $tmp2,$src2\n\t"
2579             "phaddd  $tmp2,$tmp2\n\t"
2580             "phaddd  $tmp2,$tmp2\n\t"
2581             "movd    $tmp,$src1\n\t"
2582             "paddd   $tmp,$tmp2\n\t"
2583             "movd    $dst,$tmp\t! add reduction4I" %}
2584   ins_encode %{
2585     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
2586     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
2587     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
2588     __ movdl($tmp$$XMMRegister, $src1$$Register);
2589     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
2590     __ movdl($dst$$Register, $tmp$$XMMRegister);
2591   %}
2592   ins_pipe( pipe_slow );
2593 %}
2594 
2595 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2596   predicate(UseAVX > 0);
2597   match(Set dst (AddReductionVI src1 src2));
2598   effect(TEMP tmp, TEMP tmp2);
2599   format %{ "vphaddd $tmp,$src2,$src2\n\t"
2600             "vphaddd $tmp,$tmp,$tmp2\n\t"
2601             "movd    $tmp2,$src1\n\t"
2602             "vpaddd  $tmp2,$tmp2,$tmp\n\t"
2603             "movd    $dst,$tmp2\t! add reduction4I" %}
2604   ins_encode %{
2605     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
2606     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2607     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2608     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
2609     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2610   %}
2611   ins_pipe( pipe_slow );
2612 %}
2613 
2614 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
2615   predicate(UseAVX > 0);
2616   match(Set dst (AddReductionVI src1 src2));
2617   effect(TEMP tmp, TEMP tmp2);
2618   format %{ "vphaddd $tmp,$src2,$src2\n\t"
2619             "vphaddd $tmp,$tmp,$tmp2\n\t"
2620             "vextractf128  $tmp2,$tmp\n\t"
2621             "vpaddd  $tmp,$tmp,$tmp2\n\t"
2622             "movd    $tmp2,$src1\n\t"
2623             "vpaddd  $tmp2,$tmp2,$tmp\n\t"
2624             "movd    $dst,$tmp2\t! add reduction8I" %}
2625   ins_encode %{
2626     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true);
2627     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true);
2628     __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
2629     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2630     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2631     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
2632     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2633   %}
2634   ins_pipe( pipe_slow );
2635 %}
2636 
2637 instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2638   predicate(UseSSE >= 1 && UseAVX == 0);
2639   match(Set dst (AddReductionVF src1 src2));
2640   effect(TEMP tmp, TEMP tmp2);
2641   format %{ "movdqu  $tmp,$src1\n\t"
2642             "addss   $tmp,$src2\n\t"
2643             "pshufd  $tmp2,$src2,0x01\n\t"
2644             "addss   $tmp,$tmp2\n\t"
2645             "movdqu  $dst,$tmp\t! add reduction2F" %}
2646   ins_encode %{
2647     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2648     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
2649     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2650     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2651     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2652   %}
2653   ins_pipe( pipe_slow );
2654 %}
2655 
2656 instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2657   predicate(UseAVX > 0);
2658   match(Set dst (AddReductionVF src1 src2));
2659   effect(TEMP tmp2, TEMP tmp);
2660   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
2661             "pshufd  $tmp,$src2,0x01\n\t"
2662             "vaddss  $dst,$tmp2,$tmp\t! add reduction2F" %}
2663   ins_encode %{
2664     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2665     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2666     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2667   %}
2668   ins_pipe( pipe_slow );
2669 %}
2670 
2671 instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2672   predicate(UseSSE >= 1 && UseAVX == 0);
2673   match(Set dst (AddReductionVF src1 src2));
2674   effect(TEMP tmp, TEMP tmp2);
2675   format %{ "movdqu  $tmp,$src1\n\t"
2676             "addss   $tmp,$src2\n\t"
2677             "pshufd  $tmp2,$src2,0x01\n\t"
2678             "addss   $tmp,$tmp2\n\t"
2679             "pshufd  $tmp2,$src2,0x02\n\t"
2680             "addss   $tmp,$tmp2\n\t"
2681             "pshufd  $tmp2,$src2,0x03\n\t"
2682             "addss   $tmp,$tmp2\n\t"
2683             "movdqu  $dst,$tmp\t! add reduction4F" %}
2684   ins_encode %{
2685     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2686     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
2687     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2688     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2689     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
2690     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2691     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
2692     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2693     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2694   %}
2695   ins_pipe( pipe_slow );
2696 %}
2697 
2698 instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2699   predicate(UseAVX > 0);
2700   match(Set dst (AddReductionVF src1 src2));
2701   effect(TEMP tmp, TEMP tmp2);
2702   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
2703             "pshufd  $tmp,$src2,0x01\n\t"
2704             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2705             "pshufd  $tmp,$src2,0x02\n\t"
2706             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2707             "pshufd  $tmp,$src2,0x03\n\t"
2708             "vaddss  $dst,$tmp2,$tmp\t! add reduction4F" %}
2709   ins_encode %{
2710     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2711     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2712     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2713     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
2714     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2715     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
2716     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2717   %}
2718   ins_pipe( pipe_slow );
2719 %}
2720 
2721 instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
2722   predicate(UseAVX > 0);
2723   match(Set dst (AddReductionVF src1 src2));
2724   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
2725   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
2726             "pshufd  $tmp,$src2,0x01\n\t"
2727             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2728             "pshufd  $tmp,$src2,0x02\n\t"
2729             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2730             "pshufd  $tmp,$src2,0x03\n\t"
2731             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2732             "vextractf128  $tmp3,$src2\n\t"
2733             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
2734             "pshufd  $tmp,$tmp3,0x01\n\t"
2735             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2736             "pshufd  $tmp,$tmp3,0x02\n\t"
2737             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2738             "pshufd  $tmp,$tmp3,0x03\n\t"
2739             "vaddss  $dst,$tmp2,$tmp\t! add reduction8F" %}
2740   ins_encode %{
2741     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2742     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2743     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2744     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
2745     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2746     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
2747     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2748     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
2749     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
2750     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
2751     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2752     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
2753     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2754     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
2755     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2756   %}
2757   ins_pipe( pipe_slow );
2758 %}
2759 
2760 instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
2761   predicate(UseSSE >= 1 && UseAVX == 0);
2762   match(Set dst (AddReductionVD src1 src2));
2763   effect(TEMP tmp, TEMP dst);
2764   format %{ "movdqu  $tmp,$src1\n\t"
2765             "addsd   $tmp,$src2\n\t"
2766             "pshufd  $dst,$src2,0xE\n\t"
2767             "addsd   $dst,$tmp\t! add reduction2D" %}
2768   ins_encode %{
2769     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2770     __ addsd($tmp$$XMMRegister, $src2$$XMMRegister);
2771     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
2772     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
2773   %}
2774   ins_pipe( pipe_slow );
2775 %}
2776 
2777 instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
2778   predicate(UseAVX > 0);
2779   match(Set dst (AddReductionVD src1 src2));
2780   effect(TEMP tmp, TEMP tmp2);
2781   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
2782             "pshufd  $tmp,$src2,0xE\n\t"
2783             "vaddsd  $dst,$tmp2,$tmp\t! add reduction2D" %}
2784   ins_encode %{
2785     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2786     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
2787     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2788   %}
2789   ins_pipe( pipe_slow );
2790 %}
2791 
2792 instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
2793   predicate(UseAVX > 0);
2794   match(Set dst (AddReductionVD src1 src2));
2795   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
2796   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
2797             "pshufd  $tmp,$src2,0xE\n\t"
2798             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
2799             "vextractf128  $tmp3,$src2\n\t"
2800             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
2801             "pshufd  $tmp,$tmp3,0xE\n\t"
2802             "vaddsd  $dst,$tmp2,$tmp\t! add reduction4D" %}
2803   ins_encode %{
2804     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2805     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
2806     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2807     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
2808     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
2809     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
2810     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2811   %}
2812   ins_pipe( pipe_slow );
2813 %}
2814 
2815 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2816   predicate(UseSSE > 3 && UseAVX == 0);
2817   match(Set dst (MulReductionVI src1 src2));
2818   effect(TEMP tmp, TEMP tmp2);
2819   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
2820             "pmulld  $tmp2,$src2\n\t"
2821             "movd    $tmp,$src1\n\t"
2822             "pmulld  $tmp2,$tmp\n\t"
2823             "movd    $dst,$tmp2\t! mul reduction2I" %}
2824   ins_encode %{
2825     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
2826     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
2827     __ movdl($tmp$$XMMRegister, $src1$$Register);
2828     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
2829     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2830   %}
2831   ins_pipe( pipe_slow );
2832 %}
2833 
2834 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2835   predicate(UseAVX > 0);
2836   match(Set dst (MulReductionVI src1 src2));
2837   effect(TEMP tmp, TEMP tmp2);
2838   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
2839             "vpmulld $tmp,$src2,$tmp2\n\t"
2840             "movd    $tmp2,$src1\n\t"
2841             "vpmulld $tmp2,$tmp,$tmp2\n\t"
2842             "movd    $dst,$tmp2\t! mul reduction2I" %}
2843   ins_encode %{
2844     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
2845     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
2846     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2847     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2848     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2849   %}
2850   ins_pipe( pipe_slow );
2851 %}
2852 
2853 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2854   predicate(UseSSE > 3 && UseAVX == 0);
2855   match(Set dst (MulReductionVI src1 src2));
2856   effect(TEMP tmp, TEMP tmp2);
2857   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
2858             "pmulld  $tmp2,$src2\n\t"
2859             "pshufd  $tmp,$tmp2,0x1\n\t"
2860             "pmulld  $tmp2,$tmp\n\t"
2861             "movd    $tmp,$src1\n\t"
2862             "pmulld  $tmp2,$tmp\n\t"
2863             "movd    $dst,$tmp2\t! mul reduction4I" %}
2864   ins_encode %{
2865     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
2866     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
2867     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
2868     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
2869     __ movdl($tmp$$XMMRegister, $src1$$Register);
2870     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
2871     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2872   %}
2873   ins_pipe( pipe_slow );
2874 %}
2875 
2876 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2877   predicate(UseAVX > 0);
2878   match(Set dst (MulReductionVI src1 src2));
2879   effect(TEMP tmp, TEMP tmp2);
2880   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
2881             "vpmulld $tmp,$src2,$tmp2\n\t"
2882             "pshufd  $tmp2,$tmp,0x1\n\t"
2883             "vpmulld $tmp,$tmp,$tmp2\n\t"
2884             "movd    $tmp2,$src1\n\t"
2885             "vpmulld $tmp2,$tmp,$tmp2\n\t"
2886             "movd    $dst,$tmp2\t! mul reduction4I" %}
2887   ins_encode %{
2888     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
2889     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
2890     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
2891     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2892     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2893     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2894     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2895   %}
2896   ins_pipe( pipe_slow );
2897 %}
2898 
2899 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
2900   predicate(UseAVX > 0);
2901   match(Set dst (MulReductionVI src1 src2));
2902   effect(TEMP tmp, TEMP tmp2);
2903   format %{ "vextractf128  $tmp,$src2\n\t"
2904             "vpmulld $tmp,$tmp,$src2\n\t"
2905             "pshufd  $tmp2,$tmp,0xE\n\t"
2906             "vpmulld $tmp,$tmp,$tmp2\n\t"
2907             "pshufd  $tmp2,$tmp,0x1\n\t"
2908             "vpmulld $tmp,$tmp,$tmp2\n\t"
2909             "movd    $tmp2,$src1\n\t"
2910             "vpmulld $tmp2,$tmp,$tmp2\n\t"
2911             "movd    $dst,$tmp2\t! mul reduction8I" %}
2912   ins_encode %{
2913     __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister);
2914     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false);
2915     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
2916     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2917     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
2918     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2919     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2920     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2921     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2922   %}
2923   ins_pipe( pipe_slow );
2924 %}
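
// In the integer multiply reductions above, the scalar input src1 is
// folded in last via movd, after the lanes of src2 have been multiplied
// together.  A minimal scalar C sketch of what MulReductionVI computes
// (illustrative only, not generated code):
//
//   int mul_reduction_vi(int src1, const int *src2, int n) {
//     int acc = src1;              // scalar input, folded in via movd
//     for (int i = 0; i < n; i++)
//       acc *= src2[i];            // product over all vector lanes
//     return acc;
//   }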
2925 
2926 instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2927   predicate(UseSSE >= 1 && UseAVX == 0);
2928   match(Set dst (MulReductionVF src1 src2));
2929   effect(TEMP tmp, TEMP tmp2);
2930   format %{ "movdqu  $tmp,$src1\n\t"
2931             "mulss   $tmp,$src2\n\t"
2932             "pshufd  $tmp2,$src2,0x01\n\t"
2933             "mulss   $tmp,$tmp2\n\t"
2934             "movdqu  $dst,$tmp\t! mul reduction2F" %}
2935   ins_encode %{
2936     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2937     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
2938     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2939     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2940     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2941   %}
2942   ins_pipe( pipe_slow );
2943 %}
2944 
2945 instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2946   predicate(UseAVX > 0);
2947   match(Set dst (MulReductionVF src1 src2));
2948   effect(TEMP tmp, TEMP tmp2);
2949   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
2950             "pshufd  $tmp,$src2,0x01\n\t"
2951             "vmulss  $dst,$tmp2,$tmp\t! mul reduction2F" %}
2952   ins_encode %{
2953     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2954     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2955     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2956   %}
2957   ins_pipe( pipe_slow );
2958 %}
2959 
2960 instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2961   predicate(UseSSE >= 1 && UseAVX == 0);
2962   match(Set dst (MulReductionVF src1 src2));
2963   effect(TEMP tmp, TEMP tmp2);
2964   format %{ "movdqu  $tmp,$src1\n\t"
2965             "mulss   $tmp,$src2\n\t"
2966             "pshufd  $tmp2,$src2,0x01\n\t"
2967             "mulss   $tmp,$tmp2\n\t"
2968             "pshufd  $tmp2,$src2,0x02\n\t"
2969             "mulss   $tmp,$tmp2\n\t"
2970             "pshufd  $tmp2,$src2,0x03\n\t"
2971             "mulss   $tmp,$tmp2\n\t"
2972             "movdqu  $dst,$tmp\t! mul reduction4F" %}
2973   ins_encode %{
2974     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2975     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
2976     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2977     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2978     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
2979     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2980     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
2981     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2982     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2983   %}
2984   ins_pipe( pipe_slow );
2985 %}
2986 
2987 instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2988   predicate(UseAVX > 0);
2989   match(Set dst (MulReductionVF src1 src2));
2990   effect(TEMP tmp, TEMP tmp2);
2991   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
2992             "pshufd  $tmp,$src2,0x01\n\t"
2993             "vmulss  $tmp2,$tmp2,$tmp\n\t"
2994             "pshufd  $tmp,$src2,0x02\n\t"
2995             "vmulss  $tmp2,$tmp2,$tmp\n\t"
2996             "pshufd  $tmp,$src2,0x03\n\t"
2997             "vmulss  $dst,$tmp2,$tmp\t! mul reduction4F" %}
2998   ins_encode %{
2999     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3000     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
3001     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3002     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
3003     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3004     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
3005     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3006   %}
3007   ins_pipe( pipe_slow );
3008 %}
3009 
3010 instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
3011   predicate(UseAVX > 0);
3012   match(Set dst (MulReductionVF src1 src2));
3013   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
3014   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
3015             "pshufd  $tmp,$src2,0x01\n\t"
3016             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3017             "pshufd  $tmp,$src2,0x02\n\t"
3018             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3019             "pshufd  $tmp,$src2,0x03\n\t"
3020             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3021             "vextractf128  $tmp3,$src2\n\t"
3022             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
3023             "pshufd  $tmp,$tmp3,0x01\n\t"
3024             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3025             "pshufd  $tmp,$tmp3,0x02\n\t"
3026             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3027             "pshufd  $tmp,$tmp3,0x03\n\t"
3028             "vmulss  $dst,$tmp2,$tmp\t! mul reduction8F" %}
3029   ins_encode %{
3030     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3031     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
3032     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3033     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
3034     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3035     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
3036     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3037     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
3038     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
3039     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
3040     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3041     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
3042     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3043     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
3044     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3045   %}
3046   ins_pipe( pipe_slow );
3047 %}
3048 
3049 instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
3050   predicate(UseSSE >= 1 && UseAVX == 0);
3051   match(Set dst (MulReductionVD src1 src2));
3052   effect(TEMP tmp, TEMP dst);
3053   format %{ "movdqu  $tmp,$src1\n\t"
3054             "mulsd   $tmp,$src2\n\t"
3055             "pshufd  $dst,$src2,0xE\n\t"
3056             "mulsd   $dst,$tmp\t! mul reduction2D" %}
3057   ins_encode %{
3058     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
3059     __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
3060     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
3061     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
3062   %}
3063   ins_pipe( pipe_slow );
3064 %}
3065 
3066 instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
3067   predicate(UseAVX > 0);
3068   match(Set dst (MulReductionVD src1 src2));
3069   effect(TEMP tmp, TEMP tmp2);
3070   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
3071             "pshufd  $tmp,$src2,0xE\n\t"
3072             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction2D" %}
3073   ins_encode %{
3074     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3075     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
3076     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3077   %}
3078   ins_pipe( pipe_slow );
3079 %}
3080 
3081 instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
3082   predicate(UseAVX > 0);
3083   match(Set dst (MulReductionVD src1 src2));
3084   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
3085   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
3086             "pshufd  $tmp,$src2,0xE\n\t"
3087             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
3088             "vextractf128  $tmp3,$src2\n\t"
3089             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
3090             "pshufd  $tmp,$tmp3,0xE\n\t"
3091             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction4D" %}
3092   ins_encode %{
3093     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3094     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
3095     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3096     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
3097     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
3098     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
3099     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3100   %}
3101   ins_pipe( pipe_slow );
3102 %}
3103 
3104 // ====================VECTOR ARITHMETIC=======================================
3105 
3106 // --------------------------------- ADD --------------------------------------
3107 
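// Each packed operation below generally comes in three shapes: a
// two-operand SSE form, where the destination doubles as the first source
// and therefore matches e.g. "(AddVB dst src)"; a three-operand AVX
// register-register form; and an AVX register-memory form that folds a
// LoadVector into the instruction.  The vector256 flag passed to the
// assembler selects the 128-bit (XMM) or 256-bit (YMM) VEX encoding.
//
// A minimal C intrinsics sketch of the same 128- vs 256-bit distinction
// (illustrative only; assumes <immintrin.h> and an AVX2-capable target,
// not part of this file):
//
//   #include <immintrin.h>
//   __m128i add16B_sse (__m128i a, __m128i b) { return _mm_add_epi8(a, b);    }  // paddb
//   __m256i add32B_avx2(__m256i a, __m256i b) { return _mm256_add_epi8(a, b); }  // vpaddb
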
3108 // Bytes vector add
3109 instruct vadd4B(vecS dst, vecS src) %{
3110   predicate(n->as_Vector()->length() == 4);
3111   match(Set dst (AddVB dst src));
3112   format %{ "paddb   $dst,$src\t! add packed4B" %}
3113   ins_encode %{
3114     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3115   %}
3116   ins_pipe( pipe_slow );
3117 %}
3118 
3119 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
3120   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3121   match(Set dst (AddVB src1 src2));
3122   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
3123   ins_encode %{
3124     bool vector256 = false;
3125     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3126   %}
3127   ins_pipe( pipe_slow );
3128 %}
3129 
3130 instruct vadd8B(vecD dst, vecD src) %{
3131   predicate(n->as_Vector()->length() == 8);
3132   match(Set dst (AddVB dst src));
3133   format %{ "paddb   $dst,$src\t! add packed8B" %}
3134   ins_encode %{
3135     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3136   %}
3137   ins_pipe( pipe_slow );
3138 %}
3139 
3140 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
3141   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3142   match(Set dst (AddVB src1 src2));
3143   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
3144   ins_encode %{
3145     bool vector256 = false;
3146     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3147   %}
3148   ins_pipe( pipe_slow );
3149 %}
3150 
3151 instruct vadd16B(vecX dst, vecX src) %{
3152   predicate(n->as_Vector()->length() == 16);
3153   match(Set dst (AddVB dst src));
3154   format %{ "paddb   $dst,$src\t! add packed16B" %}
3155   ins_encode %{
3156     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3157   %}
3158   ins_pipe( pipe_slow );
3159 %}
3160 
3161 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
3162   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3163   match(Set dst (AddVB src1 src2));
3164   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
3165   ins_encode %{
3166     bool vector256 = false;
3167     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3168   %}
3169   ins_pipe( pipe_slow );
3170 %}
3171 
3172 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
3173   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3174   match(Set dst (AddVB src (LoadVector mem)));
3175   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
3176   ins_encode %{
3177     bool vector256 = false;
3178     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3179   %}
3180   ins_pipe( pipe_slow );
3181 %}
3182 
3183 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
3184   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3185   match(Set dst (AddVB src1 src2));
3186   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
3187   ins_encode %{
3188     bool vector256 = true;
3189     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3190   %}
3191   ins_pipe( pipe_slow );
3192 %}
3193 
3194 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
3195   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3196   match(Set dst (AddVB src (LoadVector mem)));
3197   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
3198   ins_encode %{
3199     bool vector256 = true;
3200     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3201   %}
3202   ins_pipe( pipe_slow );
3203 %}
3204 
3205 // Shorts/Chars vector add
3206 instruct vadd2S(vecS dst, vecS src) %{
3207   predicate(n->as_Vector()->length() == 2);
3208   match(Set dst (AddVS dst src));
3209   format %{ "paddw   $dst,$src\t! add packed2S" %}
3210   ins_encode %{
3211     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3212   %}
3213   ins_pipe( pipe_slow );
3214 %}
3215 
3216 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
3217   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3218   match(Set dst (AddVS src1 src2));
3219   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
3220   ins_encode %{
3221     bool vector256 = false;
3222     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3223   %}
3224   ins_pipe( pipe_slow );
3225 %}
3226 
3227 instruct vadd4S(vecD dst, vecD src) %{
3228   predicate(n->as_Vector()->length() == 4);
3229   match(Set dst (AddVS dst src));
3230   format %{ "paddw   $dst,$src\t! add packed4S" %}
3231   ins_encode %{
3232     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3233   %}
3234   ins_pipe( pipe_slow );
3235 %}
3236 
3237 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
3238   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3239   match(Set dst (AddVS src1 src2));
3240   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
3241   ins_encode %{
3242     bool vector256 = false;
3243     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3244   %}
3245   ins_pipe( pipe_slow );
3246 %}
3247 
3248 instruct vadd8S(vecX dst, vecX src) %{
3249   predicate(n->as_Vector()->length() == 8);
3250   match(Set dst (AddVS dst src));
3251   format %{ "paddw   $dst,$src\t! add packed8S" %}
3252   ins_encode %{
3253     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3254   %}
3255   ins_pipe( pipe_slow );
3256 %}
3257 
3258 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
3259   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3260   match(Set dst (AddVS src1 src2));
3261   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
3262   ins_encode %{
3263     bool vector256 = false;
3264     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3265   %}
3266   ins_pipe( pipe_slow );
3267 %}
3268 
3269 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
3270   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3271   match(Set dst (AddVS src (LoadVector mem)));
3272   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
3273   ins_encode %{
3274     bool vector256 = false;
3275     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3276   %}
3277   ins_pipe( pipe_slow );
3278 %}
3279 
3280 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
3281   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3282   match(Set dst (AddVS src1 src2));
3283   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
3284   ins_encode %{
3285     bool vector256 = true;
3286     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3287   %}
3288   ins_pipe( pipe_slow );
3289 %}
3290 
3291 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
3292   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3293   match(Set dst (AddVS src (LoadVector mem)));
3294   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
3295   ins_encode %{
3296     bool vector256 = true;
3297     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3298   %}
3299   ins_pipe( pipe_slow );
3300 %}
3301 
3302 // Integers vector add
3303 instruct vadd2I(vecD dst, vecD src) %{
3304   predicate(n->as_Vector()->length() == 2);
3305   match(Set dst (AddVI dst src));
3306   format %{ "paddd   $dst,$src\t! add packed2I" %}
3307   ins_encode %{
3308     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
3309   %}
3310   ins_pipe( pipe_slow );
3311 %}
3312 
3313 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
3314   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3315   match(Set dst (AddVI src1 src2));
3316   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
3317   ins_encode %{
3318     bool vector256 = false;
3319     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3320   %}
3321   ins_pipe( pipe_slow );
3322 %}
3323 
3324 instruct vadd4I(vecX dst, vecX src) %{
3325   predicate(n->as_Vector()->length() == 4);
3326   match(Set dst (AddVI dst src));
3327   format %{ "paddd   $dst,$src\t! add packed4I" %}
3328   ins_encode %{
3329     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
3330   %}
3331   ins_pipe( pipe_slow );
3332 %}
3333 
3334 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
3335   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3336   match(Set dst (AddVI src1 src2));
3337   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
3338   ins_encode %{
3339     bool vector256 = false;
3340     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3341   %}
3342   ins_pipe( pipe_slow );
3343 %}
3344 
3345 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
3346   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3347   match(Set dst (AddVI src (LoadVector mem)));
3348   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
3349   ins_encode %{
3350     bool vector256 = false;
3351     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
3357   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3358   match(Set dst (AddVI src1 src2));
3359   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
3360   ins_encode %{
3361     bool vector256 = true;
3362     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3363   %}
3364   ins_pipe( pipe_slow );
3365 %}
3366 
3367 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
3368   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3369   match(Set dst (AddVI src (LoadVector mem)));
3370   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
3371   ins_encode %{
3372     bool vector256 = true;
3373     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3374   %}
3375   ins_pipe( pipe_slow );
3376 %}
3377 
3378 // Longs vector add
3379 instruct vadd2L(vecX dst, vecX src) %{
3380   predicate(n->as_Vector()->length() == 2);
3381   match(Set dst (AddVL dst src));
3382   format %{ "paddq   $dst,$src\t! add packed2L" %}
3383   ins_encode %{
3384     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
3385   %}
3386   ins_pipe( pipe_slow );
3387 %}
3388 
3389 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
3390   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3391   match(Set dst (AddVL src1 src2));
3392   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
3393   ins_encode %{
3394     bool vector256 = false;
3395     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3396   %}
3397   ins_pipe( pipe_slow );
3398 %}
3399 
3400 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
3401   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3402   match(Set dst (AddVL src (LoadVector mem)));
3403   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
3404   ins_encode %{
3405     bool vector256 = false;
3406     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3407   %}
3408   ins_pipe( pipe_slow );
3409 %}
3410 
3411 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
3412   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3413   match(Set dst (AddVL src1 src2));
3414   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
3415   ins_encode %{
3416     bool vector256 = true;
3417     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3418   %}
3419   ins_pipe( pipe_slow );
3420 %}
3421 
3422 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
3423   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3424   match(Set dst (AddVL src (LoadVector mem)));
3425   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
3426   ins_encode %{
3427     bool vector256 = true;
3428     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3429   %}
3430   ins_pipe( pipe_slow );
3431 %}
3432 
3433 // Floats vector add
3434 instruct vadd2F(vecD dst, vecD src) %{
3435   predicate(n->as_Vector()->length() == 2);
3436   match(Set dst (AddVF dst src));
3437   format %{ "addps   $dst,$src\t! add packed2F" %}
3438   ins_encode %{
3439     __ addps($dst$$XMMRegister, $src$$XMMRegister);
3440   %}
3441   ins_pipe( pipe_slow );
3442 %}
3443 
3444 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
3445   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3446   match(Set dst (AddVF src1 src2));
3447   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
3448   ins_encode %{
3449     bool vector256 = false;
3450     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3451   %}
3452   ins_pipe( pipe_slow );
3453 %}
3454 
3455 instruct vadd4F(vecX dst, vecX src) %{
3456   predicate(n->as_Vector()->length() == 4);
3457   match(Set dst (AddVF dst src));
3458   format %{ "addps   $dst,$src\t! add packed4F" %}
3459   ins_encode %{
3460     __ addps($dst$$XMMRegister, $src$$XMMRegister);
3461   %}
3462   ins_pipe( pipe_slow );
3463 %}
3464 
3465 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
3466   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3467   match(Set dst (AddVF src1 src2));
3468   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
3469   ins_encode %{
3470     bool vector256 = false;
3471     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3472   %}
3473   ins_pipe( pipe_slow );
3474 %}
3475 
3476 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
3477   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3478   match(Set dst (AddVF src (LoadVector mem)));
3479   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
3480   ins_encode %{
3481     bool vector256 = false;
3482     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3483   %}
3484   ins_pipe( pipe_slow );
3485 %}
3486 
3487 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
3488   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3489   match(Set dst (AddVF src1 src2));
3490   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
3491   ins_encode %{
3492     bool vector256 = true;
3493     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3494   %}
3495   ins_pipe( pipe_slow );
3496 %}
3497 
3498 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
3499   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3500   match(Set dst (AddVF src (LoadVector mem)));
3501   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
3502   ins_encode %{
3503     bool vector256 = true;
3504     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3505   %}
3506   ins_pipe( pipe_slow );
3507 %}
3508 
3509 // Doubles vector add
3510 instruct vadd2D(vecX dst, vecX src) %{
3511   predicate(n->as_Vector()->length() == 2);
3512   match(Set dst (AddVD dst src));
3513   format %{ "addpd   $dst,$src\t! add packed2D" %}
3514   ins_encode %{
3515     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
3516   %}
3517   ins_pipe( pipe_slow );
3518 %}
3519 
3520 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
3521   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3522   match(Set dst (AddVD src1 src2));
3523   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
3524   ins_encode %{
3525     bool vector256 = false;
3526     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3527   %}
3528   ins_pipe( pipe_slow );
3529 %}
3530 
3531 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
3532   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3533   match(Set dst (AddVD src (LoadVector mem)));
3534   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
3535   ins_encode %{
3536     bool vector256 = false;
3537     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3538   %}
3539   ins_pipe( pipe_slow );
3540 %}
3541 
3542 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
3543   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3544   match(Set dst (AddVD src1 src2));
3545   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
3546   ins_encode %{
3547     bool vector256 = true;
3548     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3549   %}
3550   ins_pipe( pipe_slow );
3551 %}
3552 
3553 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
3554   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3555   match(Set dst (AddVD src (LoadVector mem)));
3556   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
3557   ins_encode %{
3558     bool vector256 = true;
3559     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3560   %}
3561   ins_pipe( pipe_slow );
3562 %}
3563 
3564 // --------------------------------- SUB --------------------------------------
3565 
3566 // Bytes vector sub
3567 instruct vsub4B(vecS dst, vecS src) %{
3568   predicate(n->as_Vector()->length() == 4);
3569   match(Set dst (SubVB dst src));
3570   format %{ "psubb   $dst,$src\t! sub packed4B" %}
3571   ins_encode %{
3572     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3573   %}
3574   ins_pipe( pipe_slow );
3575 %}
3576 
3577 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
3578   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3579   match(Set dst (SubVB src1 src2));
3580   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
3581   ins_encode %{
3582     bool vector256 = false;
3583     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3584   %}
3585   ins_pipe( pipe_slow );
3586 %}
3587 
3588 instruct vsub8B(vecD dst, vecD src) %{
3589   predicate(n->as_Vector()->length() == 8);
3590   match(Set dst (SubVB dst src));
3591   format %{ "psubb   $dst,$src\t! sub packed8B" %}
3592   ins_encode %{
3593     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3594   %}
3595   ins_pipe( pipe_slow );
3596 %}
3597 
3598 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
3599   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3600   match(Set dst (SubVB src1 src2));
3601   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
3602   ins_encode %{
3603     bool vector256 = false;
3604     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3605   %}
3606   ins_pipe( pipe_slow );
3607 %}
3608 
3609 instruct vsub16B(vecX dst, vecX src) %{
3610   predicate(n->as_Vector()->length() == 16);
3611   match(Set dst (SubVB dst src));
3612   format %{ "psubb   $dst,$src\t! sub packed16B" %}
3613   ins_encode %{
3614     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3615   %}
3616   ins_pipe( pipe_slow );
3617 %}
3618 
3619 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
3620   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3621   match(Set dst (SubVB src1 src2));
3622   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
3623   ins_encode %{
3624     bool vector256 = false;
3625     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3626   %}
3627   ins_pipe( pipe_slow );
3628 %}
3629 
3630 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
3631   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3632   match(Set dst (SubVB src (LoadVector mem)));
3633   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
3634   ins_encode %{
3635     bool vector256 = false;
3636     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3637   %}
3638   ins_pipe( pipe_slow );
3639 %}
3640 
3641 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
3642   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3643   match(Set dst (SubVB src1 src2));
3644   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
3645   ins_encode %{
3646     bool vector256 = true;
3647     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3648   %}
3649   ins_pipe( pipe_slow );
3650 %}
3651 
3652 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
3653   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3654   match(Set dst (SubVB src (LoadVector mem)));
3655   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
3656   ins_encode %{
3657     bool vector256 = true;
3658     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3659   %}
3660   ins_pipe( pipe_slow );
3661 %}
3662 
3663 // Shorts/Chars vector sub
3664 instruct vsub2S(vecS dst, vecS src) %{
3665   predicate(n->as_Vector()->length() == 2);
3666   match(Set dst (SubVS dst src));
3667   format %{ "psubw   $dst,$src\t! sub packed2S" %}
3668   ins_encode %{
3669     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3670   %}
3671   ins_pipe( pipe_slow );
3672 %}
3673 
3674 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
3675   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3676   match(Set dst (SubVS src1 src2));
3677   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
3678   ins_encode %{
3679     bool vector256 = false;
3680     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3681   %}
3682   ins_pipe( pipe_slow );
3683 %}
3684 
3685 instruct vsub4S(vecD dst, vecD src) %{
3686   predicate(n->as_Vector()->length() == 4);
3687   match(Set dst (SubVS dst src));
3688   format %{ "psubw   $dst,$src\t! sub packed4S" %}
3689   ins_encode %{
3690     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3691   %}
3692   ins_pipe( pipe_slow );
3693 %}
3694 
3695 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
3696   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3697   match(Set dst (SubVS src1 src2));
3698   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
3699   ins_encode %{
3700     bool vector256 = false;
3701     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3702   %}
3703   ins_pipe( pipe_slow );
3704 %}
3705 
3706 instruct vsub8S(vecX dst, vecX src) %{
3707   predicate(n->as_Vector()->length() == 8);
3708   match(Set dst (SubVS dst src));
3709   format %{ "psubw   $dst,$src\t! sub packed8S" %}
3710   ins_encode %{
3711     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3712   %}
3713   ins_pipe( pipe_slow );
3714 %}
3715 
3716 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
3717   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3718   match(Set dst (SubVS src1 src2));
3719   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
3720   ins_encode %{
3721     bool vector256 = false;
3722     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3723   %}
3724   ins_pipe( pipe_slow );
3725 %}
3726 
3727 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
3728   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3729   match(Set dst (SubVS src (LoadVector mem)));
3730   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
3731   ins_encode %{
3732     bool vector256 = false;
3733     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3734   %}
3735   ins_pipe( pipe_slow );
3736 %}
3737 
3738 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
3739   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3740   match(Set dst (SubVS src1 src2));
3741   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
3742   ins_encode %{
3743     bool vector256 = true;
3744     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3745   %}
3746   ins_pipe( pipe_slow );
3747 %}
3748 
3749 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
3750   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3751   match(Set dst (SubVS src (LoadVector mem)));
3752   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
3753   ins_encode %{
3754     bool vector256 = true;
3755     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3756   %}
3757   ins_pipe( pipe_slow );
3758 %}
3759 
3760 // Integers vector sub
3761 instruct vsub2I(vecD dst, vecD src) %{
3762   predicate(n->as_Vector()->length() == 2);
3763   match(Set dst (SubVI dst src));
3764   format %{ "psubd   $dst,$src\t! sub packed2I" %}
3765   ins_encode %{
3766     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3767   %}
3768   ins_pipe( pipe_slow );
3769 %}
3770 
3771 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
3772   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3773   match(Set dst (SubVI src1 src2));
3774   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
3775   ins_encode %{
3776     bool vector256 = false;
3777     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3778   %}
3779   ins_pipe( pipe_slow );
3780 %}
3781 
3782 instruct vsub4I(vecX dst, vecX src) %{
3783   predicate(n->as_Vector()->length() == 4);
3784   match(Set dst (SubVI dst src));
3785   format %{ "psubd   $dst,$src\t! sub packed4I" %}
3786   ins_encode %{
3787     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3788   %}
3789   ins_pipe( pipe_slow );
3790 %}
3791 
3792 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
3793   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3794   match(Set dst (SubVI src1 src2));
3795   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
3796   ins_encode %{
3797     bool vector256 = false;
3798     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3799   %}
3800   ins_pipe( pipe_slow );
3801 %}
3802 
3803 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
3804   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3805   match(Set dst (SubVI src (LoadVector mem)));
3806   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
3807   ins_encode %{
3808     bool vector256 = false;
3809     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3810   %}
3811   ins_pipe( pipe_slow );
3812 %}
3813 
3814 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
3815   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3816   match(Set dst (SubVI src1 src2));
3817   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
3818   ins_encode %{
3819     bool vector256 = true;
3820     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3821   %}
3822   ins_pipe( pipe_slow );
3823 %}
3824 
3825 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
3826   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3827   match(Set dst (SubVI src (LoadVector mem)));
3828   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
3829   ins_encode %{
3830     bool vector256 = true;
3831     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3832   %}
3833   ins_pipe( pipe_slow );
3834 %}
3835 
3836 // Longs vector sub
3837 instruct vsub2L(vecX dst, vecX src) %{
3838   predicate(n->as_Vector()->length() == 2);
3839   match(Set dst (SubVL dst src));
3840   format %{ "psubq   $dst,$src\t! sub packed2L" %}
3841   ins_encode %{
3842     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
3843   %}
3844   ins_pipe( pipe_slow );
3845 %}
3846 
3847 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
3848   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3849   match(Set dst (SubVL src1 src2));
3850   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
3851   ins_encode %{
3852     bool vector256 = false;
3853     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3854   %}
3855   ins_pipe( pipe_slow );
3856 %}
3857 
3858 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
3859   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3860   match(Set dst (SubVL src (LoadVector mem)));
3861   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
3862   ins_encode %{
3863     bool vector256 = false;
3864     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3865   %}
3866   ins_pipe( pipe_slow );
3867 %}
3868 
3869 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
3870   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3871   match(Set dst (SubVL src1 src2));
3872   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
3873   ins_encode %{
3874     bool vector256 = true;
3875     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3876   %}
3877   ins_pipe( pipe_slow );
3878 %}
3879 
3880 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
3881   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3882   match(Set dst (SubVL src (LoadVector mem)));
3883   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
3884   ins_encode %{
3885     bool vector256 = true;
3886     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3887   %}
3888   ins_pipe( pipe_slow );
3889 %}
3890 
3891 // Floats vector sub
3892 instruct vsub2F(vecD dst, vecD src) %{
3893   predicate(n->as_Vector()->length() == 2);
3894   match(Set dst (SubVF dst src));
3895   format %{ "subps   $dst,$src\t! sub packed2F" %}
3896   ins_encode %{
3897     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3898   %}
3899   ins_pipe( pipe_slow );
3900 %}
3901 
3902 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
3903   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3904   match(Set dst (SubVF src1 src2));
3905   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
3906   ins_encode %{
3907     bool vector256 = false;
3908     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3909   %}
3910   ins_pipe( pipe_slow );
3911 %}
3912 
3913 instruct vsub4F(vecX dst, vecX src) %{
3914   predicate(n->as_Vector()->length() == 4);
3915   match(Set dst (SubVF dst src));
3916   format %{ "subps   $dst,$src\t! sub packed4F" %}
3917   ins_encode %{
3918     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3919   %}
3920   ins_pipe( pipe_slow );
3921 %}
3922 
3923 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
3924   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3925   match(Set dst (SubVF src1 src2));
3926   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
3927   ins_encode %{
3928     bool vector256 = false;
3929     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3930   %}
3931   ins_pipe( pipe_slow );
3932 %}
3933 
3934 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
3935   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3936   match(Set dst (SubVF src (LoadVector mem)));
3937   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
3938   ins_encode %{
3939     bool vector256 = false;
3940     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3941   %}
3942   ins_pipe( pipe_slow );
3943 %}
3944 
3945 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
3946   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3947   match(Set dst (SubVF src1 src2));
3948   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
3949   ins_encode %{
3950     bool vector256 = true;
3951     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3952   %}
3953   ins_pipe( pipe_slow );
3954 %}
3955 
3956 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
3957   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3958   match(Set dst (SubVF src (LoadVector mem)));
3959   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
3960   ins_encode %{
3961     bool vector256 = true;
3962     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3963   %}
3964   ins_pipe( pipe_slow );
3965 %}
3966 
3967 // Doubles vector sub
3968 instruct vsub2D(vecX dst, vecX src) %{
3969   predicate(n->as_Vector()->length() == 2);
3970   match(Set dst (SubVD dst src));
3971   format %{ "subpd   $dst,$src\t! sub packed2D" %}
3972   ins_encode %{
3973     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
3974   %}
3975   ins_pipe( pipe_slow );
3976 %}
3977 
3978 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
3979   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3980   match(Set dst (SubVD src1 src2));
3981   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
3982   ins_encode %{
3983     bool vector256 = false;
3984     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3985   %}
3986   ins_pipe( pipe_slow );
3987 %}
3988 
3989 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
3990   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3991   match(Set dst (SubVD src (LoadVector mem)));
3992   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
3993   ins_encode %{
3994     bool vector256 = false;
3995     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3996   %}
3997   ins_pipe( pipe_slow );
3998 %}
3999 
4000 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
4001   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4002   match(Set dst (SubVD src1 src2));
4003   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
4004   ins_encode %{
4005     bool vector256 = true;
4006     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4007   %}
4008   ins_pipe( pipe_slow );
4009 %}
4010 
4011 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
4012   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4013   match(Set dst (SubVD src (LoadVector mem)));
4014   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
4015   ins_encode %{
4016     bool vector256 = true;
4017     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4018   %}
4019   ins_pipe( pipe_slow );
4020 %}
4021 
4022 // --------------------------------- MUL --------------------------------------
4023 
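// x86 has no packed byte multiply, so the smallest element size handled
// here is 16 bits.  pmullw/vpmullw keep only the low 16 bits of each
// 16x16-bit product.
//
// Illustrative intrinsics equivalent of the 8-short case (sketch only,
// assumes <emmintrin.h>/SSE2; not generated code):
//
//   #include <emmintrin.h>
//   __m128i mul8S(__m128i a, __m128i b) { return _mm_mullo_epi16(a, b); }  // pmullw
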
4024 // Shorts/Chars vector mul
4025 instruct vmul2S(vecS dst, vecS src) %{
4026   predicate(n->as_Vector()->length() == 2);
4027   match(Set dst (MulVS dst src));
4028   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
4029   ins_encode %{
4030     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4031   %}
4032   ins_pipe( pipe_slow );
4033 %}
4034 
4035 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
4036   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4037   match(Set dst (MulVS src1 src2));
4038   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
4039   ins_encode %{
4040     bool vector256 = false;
4041     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4042   %}
4043   ins_pipe( pipe_slow );
4044 %}
4045 
4046 instruct vmul4S(vecD dst, vecD src) %{
4047   predicate(n->as_Vector()->length() == 4);
4048   match(Set dst (MulVS dst src));
4049   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
4050   ins_encode %{
4051     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4052   %}
4053   ins_pipe( pipe_slow );
4054 %}
4055 
4056 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
4057   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4058   match(Set dst (MulVS src1 src2));
4059   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
4060   ins_encode %{
4061     bool vector256 = false;
4062     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4063   %}
4064   ins_pipe( pipe_slow );
4065 %}
4066 
4067 instruct vmul8S(vecX dst, vecX src) %{
4068   predicate(n->as_Vector()->length() == 8);
4069   match(Set dst (MulVS dst src));
4070   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
4071   ins_encode %{
4072     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4073   %}
4074   ins_pipe( pipe_slow );
4075 %}
4076 
4077 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
4078   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4079   match(Set dst (MulVS src1 src2));
4080   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
4081   ins_encode %{
4082     bool vector256 = false;
4083     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4084   %}
4085   ins_pipe( pipe_slow );
4086 %}
4087 
4088 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
4089   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4090   match(Set dst (MulVS src (LoadVector mem)));
4091   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
4092   ins_encode %{
4093     bool vector256 = false;
4094     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4095   %}
4096   ins_pipe( pipe_slow );
4097 %}
4098 
4099 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
4100   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4101   match(Set dst (MulVS src1 src2));
4102   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
4103   ins_encode %{
4104     bool vector256 = true;
4105     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4106   %}
4107   ins_pipe( pipe_slow );
4108 %}
4109 
4110 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
4111   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4112   match(Set dst (MulVS src (LoadVector mem)));
4113   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
4114   ins_encode %{
4115     bool vector256 = true;
4116     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4117   %}
4118   ins_pipe( pipe_slow );
4119 %}
4120 
4121 // Integers vector mul (sse4_1)
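//
// pmulld (and its VEX form vpmulld) keeps the low 32 bits of each
// 32x32-bit product and was introduced with SSE4.1, which is why the
// non-AVX forms below are guarded by "UseSSE > 3".
//
// Illustrative intrinsics equivalent (sketch only, assumes
// <smmintrin.h>/SSE4.1; not generated code):
//
//   #include <smmintrin.h>
//   __m128i mul4I(__m128i a, __m128i b) { return _mm_mullo_epi32(a, b); }  // pmulld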
4122 instruct vmul2I(vecD dst, vecD src) %{
4123   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
4124   match(Set dst (MulVI dst src));
4125   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
4126   ins_encode %{
4127     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
4128   %}
4129   ins_pipe( pipe_slow );
4130 %}
4131 
4132 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
4133   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4134   match(Set dst (MulVI src1 src2));
4135   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
4136   ins_encode %{
4137     bool vector256 = false;
4138     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4139   %}
4140   ins_pipe( pipe_slow );
4141 %}
4142 
4143 instruct vmul4I(vecX dst, vecX src) %{
4144   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
4145   match(Set dst (MulVI dst src));
4146   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
4147   ins_encode %{
4148     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
4149   %}
4150   ins_pipe( pipe_slow );
4151 %}
4152 
4153 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
4154   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4155   match(Set dst (MulVI src1 src2));
4156   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
4157   ins_encode %{
4158     bool vector256 = false;
4159     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4160   %}
4161   ins_pipe( pipe_slow );
4162 %}
4163 
4164 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
4165   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4166   match(Set dst (MulVI src (LoadVector mem)));
4167   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
4168   ins_encode %{
4169     bool vector256 = false;
4170     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4171   %}
4172   ins_pipe( pipe_slow );
4173 %}
4174 
4175 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
4176   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4177   match(Set dst (MulVI src1 src2));
4178   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
4179   ins_encode %{
4180     bool vector256 = true;
4181     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4182   %}
4183   ins_pipe( pipe_slow );
4184 %}
4185 
4186 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
4187   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4188   match(Set dst (MulVI src (LoadVector mem)));
4189   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
4190   ins_encode %{
4191     bool vector256 = true;
4192     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4193   %}
4194   ins_pipe( pipe_slow );
4195 %}
4196 
4197 // Floats vector mul
4198 instruct vmul2F(vecD dst, vecD src) %{
4199   predicate(n->as_Vector()->length() == 2);
4200   match(Set dst (MulVF dst src));
4201   format %{ "mulps   $dst,$src\t! mul packed2F" %}
4202   ins_encode %{
4203     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
4204   %}
4205   ins_pipe( pipe_slow );
4206 %}
4207 
4208 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
4209   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4210   match(Set dst (MulVF src1 src2));
4211   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
4212   ins_encode %{
4213     bool vector256 = false;
4214     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4215   %}
4216   ins_pipe( pipe_slow );
4217 %}
4218 
4219 instruct vmul4F(vecX dst, vecX src) %{
4220   predicate(n->as_Vector()->length() == 4);
4221   match(Set dst (MulVF dst src));
4222   format %{ "mulps   $dst,$src\t! mul packed4F" %}
4223   ins_encode %{
4224     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
4225   %}
4226   ins_pipe( pipe_slow );
4227 %}
4228 
4229 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
4230   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4231   match(Set dst (MulVF src1 src2));
4232   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
4233   ins_encode %{
4234     bool vector256 = false;
4235     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4236   %}
4237   ins_pipe( pipe_slow );
4238 %}
4239 
4240 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
4241   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4242   match(Set dst (MulVF src (LoadVector mem)));
4243   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
4244   ins_encode %{
4245     bool vector256 = false;
4246     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4247   %}
4248   ins_pipe( pipe_slow );
4249 %}
4250 
4251 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
4252   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4253   match(Set dst (MulVF src1 src2));
4254   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
4255   ins_encode %{
4256     bool vector256 = true;
4257     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4258   %}
4259   ins_pipe( pipe_slow );
4260 %}
4261 
4262 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
4263   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4264   match(Set dst (MulVF src (LoadVector mem)));
4265   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
4266   ins_encode %{
4267     bool vector256 = true;
4268     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4269   %}
4270   ins_pipe( pipe_slow );
4271 %}
4272 
4273 // Doubles vector mul
4274 instruct vmul2D(vecX dst, vecX src) %{
4275   predicate(n->as_Vector()->length() == 2);
4276   match(Set dst (MulVD dst src));
4277   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
4278   ins_encode %{
4279     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
4280   %}
4281   ins_pipe( pipe_slow );
4282 %}
4283 
4284 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
4285   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4286   match(Set dst (MulVD src1 src2));
4287   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
4288   ins_encode %{
4289     bool vector256 = false;
4290     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4291   %}
4292   ins_pipe( pipe_slow );
4293 %}
4294 
4295 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
4296   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4297   match(Set dst (MulVD src (LoadVector mem)));
4298   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
4299   ins_encode %{
4300     bool vector256 = false;
4301     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4302   %}
4303   ins_pipe( pipe_slow );
4304 %}
4305 
4306 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
4307   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4308   match(Set dst (MulVD src1 src2));
4309   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
4310   ins_encode %{
4311     bool vector256 = true;
4312     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4313   %}
4314   ins_pipe( pipe_slow );
4315 %}
4316 
4317 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
4318   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4319   match(Set dst (MulVD src (LoadVector mem)));
4320   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
4321   ins_encode %{
4322     bool vector256 = true;
4323     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4324   %}
4325   ins_pipe( pipe_slow );
4326 %}
4327 
4328 // --------------------------------- DIV --------------------------------------
4329 
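// x86 SIMD has no packed integer divide instruction, so only the
// floating-point forms are matched here.
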
4330 // Floats vector div
4331 instruct vdiv2F(vecD dst, vecD src) %{
4332   predicate(n->as_Vector()->length() == 2);
4333   match(Set dst (DivVF dst src));
4334   format %{ "divps   $dst,$src\t! div packed2F" %}
4335   ins_encode %{
4336     __ divps($dst$$XMMRegister, $src$$XMMRegister);
4337   %}
4338   ins_pipe( pipe_slow );
4339 %}
4340 
4341 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
4342   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4343   match(Set dst (DivVF src1 src2));
4344   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
4345   ins_encode %{
4346     bool vector256 = false;
4347     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4348   %}
4349   ins_pipe( pipe_slow );
4350 %}
4351 
4352 instruct vdiv4F(vecX dst, vecX src) %{
4353   predicate(n->as_Vector()->length() == 4);
4354   match(Set dst (DivVF dst src));
4355   format %{ "divps   $dst,$src\t! div packed4F" %}
4356   ins_encode %{
4357     __ divps($dst$$XMMRegister, $src$$XMMRegister);
4358   %}
4359   ins_pipe( pipe_slow );
4360 %}
4361 
4362 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
4363   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4364   match(Set dst (DivVF src1 src2));
4365   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
4366   ins_encode %{
4367     bool vector256 = false;
4368     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4369   %}
4370   ins_pipe( pipe_slow );
4371 %}
4372 
4373 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
4374   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4375   match(Set dst (DivVF src (LoadVector mem)));
4376   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
4377   ins_encode %{
4378     bool vector256 = false;
4379     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4380   %}
4381   ins_pipe( pipe_slow );
4382 %}
4383 
4384 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
4385   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4386   match(Set dst (DivVF src1 src2));
4387   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
4388   ins_encode %{
4389     bool vector256 = true;
4390     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4391   %}
4392   ins_pipe( pipe_slow );
4393 %}
4394 
4395 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
4396   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4397   match(Set dst (DivVF src (LoadVector mem)));
4398   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
4399   ins_encode %{
4400     bool vector256 = true;
4401     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4402   %}
4403   ins_pipe( pipe_slow );
4404 %}
4405 
4406 // Doubles vector div
4407 instruct vdiv2D(vecX dst, vecX src) %{
4408   predicate(n->as_Vector()->length() == 2);
4409   match(Set dst (DivVD dst src));
4410   format %{ "divpd   $dst,$src\t! div packed2D" %}
4411   ins_encode %{
4412     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
4413   %}
4414   ins_pipe( pipe_slow );
4415 %}
4416 
4417 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
4418   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4419   match(Set dst (DivVD src1 src2));
4420   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
4421   ins_encode %{
4422     bool vector256 = false;
4423     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4424   %}
4425   ins_pipe( pipe_slow );
4426 %}
4427 
4428 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
4429   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4430   match(Set dst (DivVD src (LoadVector mem)));
4431   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
4432   ins_encode %{
4433     bool vector256 = false;
4434     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4435   %}
4436   ins_pipe( pipe_slow );
4437 %}
4438 
4439 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
4440   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4441   match(Set dst (DivVD src1 src2));
4442   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
4443   ins_encode %{
4444     bool vector256 = true;
4445     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4446   %}
4447   ins_pipe( pipe_slow );
4448 %}
4449 
4450 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
4451   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4452   match(Set dst (DivVD src (LoadVector mem)));
4453   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
4454   ins_encode %{
4455     bool vector256 = true;
4456     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4457   %}
4458   ins_pipe( pipe_slow );
4459 %}
4460 
4461 // ------------------------------ Shift ---------------------------------------
4462 
// Left and right shift count vectors are the same on x86
// (only the lowest bits of the xmm register are used for the count).
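//
// As a point of reference (a hedged sketch, not part of the matching rules),
// the kind of Java loop that gives rise to LShiftCntV/RShiftCntV nodes is one
// with a loop-invariant, variable shift count, e.g.:
//
//   static void shiftAll(int[] a, int s) {
//     for (int i = 0; i < a.length; i++) {
//       a[i] = a[i] << s;   // superword: count loaded once, then packed shifts
//     }
//   }
//
// The count is moved once into the low bits of an xmm register by the rule
// below and then reused by the packed shift rules that follow.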
4465 instruct vshiftcnt(vecS dst, rRegI cnt) %{
4466   match(Set dst (LShiftCntV cnt));
4467   match(Set dst (RShiftCntV cnt));
4468   format %{ "movd    $dst,$cnt\t! load shift count" %}
4469   ins_encode %{
4470     __ movdl($dst$$XMMRegister, $cnt$$Register);
4471   %}
4472   ins_pipe( pipe_slow );
4473 %}
4474 
4475 // ------------------------------ LeftShift -----------------------------------
4476 
4477 // Shorts/Chars vector left shift
4478 instruct vsll2S(vecS dst, vecS shift) %{
4479   predicate(n->as_Vector()->length() == 2);
4480   match(Set dst (LShiftVS dst shift));
4481   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
4482   ins_encode %{
4483     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
4484   %}
4485   ins_pipe( pipe_slow );
4486 %}
4487 
4488 instruct vsll2S_imm(vecS dst, immI8 shift) %{
4489   predicate(n->as_Vector()->length() == 2);
4490   match(Set dst (LShiftVS dst shift));
4491   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
4492   ins_encode %{
4493     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4494   %}
4495   ins_pipe( pipe_slow );
4496 %}
4497 
4498 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
4499   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4500   match(Set dst (LShiftVS src shift));
4501   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
4502   ins_encode %{
4503     bool vector256 = false;
4504     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4505   %}
4506   ins_pipe( pipe_slow );
4507 %}
4508 
4509 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4510   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4511   match(Set dst (LShiftVS src shift));
4512   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
4513   ins_encode %{
4514     bool vector256 = false;
4515     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4516   %}
4517   ins_pipe( pipe_slow );
4518 %}
4519 
4520 instruct vsll4S(vecD dst, vecS shift) %{
4521   predicate(n->as_Vector()->length() == 4);
4522   match(Set dst (LShiftVS dst shift));
4523   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
4524   ins_encode %{
4525     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
4526   %}
4527   ins_pipe( pipe_slow );
4528 %}
4529 
4530 instruct vsll4S_imm(vecD dst, immI8 shift) %{
4531   predicate(n->as_Vector()->length() == 4);
4532   match(Set dst (LShiftVS dst shift));
4533   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
4534   ins_encode %{
4535     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4536   %}
4537   ins_pipe( pipe_slow );
4538 %}
4539 
4540 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
4541   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4542   match(Set dst (LShiftVS src shift));
4543   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
4544   ins_encode %{
4545     bool vector256 = false;
4546     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4547   %}
4548   ins_pipe( pipe_slow );
4549 %}
4550 
4551 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4552   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4553   match(Set dst (LShiftVS src shift));
4554   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
4555   ins_encode %{
4556     bool vector256 = false;
4557     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4558   %}
4559   ins_pipe( pipe_slow );
4560 %}
4561 
4562 instruct vsll8S(vecX dst, vecS shift) %{
4563   predicate(n->as_Vector()->length() == 8);
4564   match(Set dst (LShiftVS dst shift));
4565   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
4566   ins_encode %{
4567     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
4568   %}
4569   ins_pipe( pipe_slow );
4570 %}
4571 
4572 instruct vsll8S_imm(vecX dst, immI8 shift) %{
4573   predicate(n->as_Vector()->length() == 8);
4574   match(Set dst (LShiftVS dst shift));
4575   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
4576   ins_encode %{
4577     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4578   %}
4579   ins_pipe( pipe_slow );
4580 %}
4581 
4582 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
4583   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4584   match(Set dst (LShiftVS src shift));
4585   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4586   ins_encode %{
4587     bool vector256 = false;
4588     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4589   %}
4590   ins_pipe( pipe_slow );
4591 %}
4592 
4593 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4594   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4595   match(Set dst (LShiftVS src shift));
4596   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4597   ins_encode %{
4598     bool vector256 = false;
4599     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4600   %}
4601   ins_pipe( pipe_slow );
4602 %}
4603 
4604 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
4605   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4606   match(Set dst (LShiftVS src shift));
4607   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4608   ins_encode %{
4609     bool vector256 = true;
4610     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4611   %}
4612   ins_pipe( pipe_slow );
4613 %}
4614 
4615 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4616   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4617   match(Set dst (LShiftVS src shift));
4618   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4619   ins_encode %{
4620     bool vector256 = true;
4621     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4622   %}
4623   ins_pipe( pipe_slow );
4624 %}
4625 
4626 // Integers vector left shift
4627 instruct vsll2I(vecD dst, vecS shift) %{
4628   predicate(n->as_Vector()->length() == 2);
4629   match(Set dst (LShiftVI dst shift));
4630   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4631   ins_encode %{
4632     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4633   %}
4634   ins_pipe( pipe_slow );
4635 %}
4636 
4637 instruct vsll2I_imm(vecD dst, immI8 shift) %{
4638   predicate(n->as_Vector()->length() == 2);
4639   match(Set dst (LShiftVI dst shift));
4640   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4641   ins_encode %{
4642     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4643   %}
4644   ins_pipe( pipe_slow );
4645 %}
4646 
4647 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
4648   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4649   match(Set dst (LShiftVI src shift));
4650   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4651   ins_encode %{
4652     bool vector256 = false;
4653     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4654   %}
4655   ins_pipe( pipe_slow );
4656 %}
4657 
4658 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4659   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4660   match(Set dst (LShiftVI src shift));
4661   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4662   ins_encode %{
4663     bool vector256 = false;
4664     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4665   %}
4666   ins_pipe( pipe_slow );
4667 %}
4668 
4669 instruct vsll4I(vecX dst, vecS shift) %{
4670   predicate(n->as_Vector()->length() == 4);
4671   match(Set dst (LShiftVI dst shift));
4672   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4673   ins_encode %{
4674     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4675   %}
4676   ins_pipe( pipe_slow );
4677 %}
4678 
4679 instruct vsll4I_imm(vecX dst, immI8 shift) %{
4680   predicate(n->as_Vector()->length() == 4);
4681   match(Set dst (LShiftVI dst shift));
4682   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4683   ins_encode %{
4684     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4685   %}
4686   ins_pipe( pipe_slow );
4687 %}
4688 
4689 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
4690   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4691   match(Set dst (LShiftVI src shift));
4692   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4693   ins_encode %{
4694     bool vector256 = false;
4695     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4696   %}
4697   ins_pipe( pipe_slow );
4698 %}
4699 
4700 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4701   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4702   match(Set dst (LShiftVI src shift));
4703   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4704   ins_encode %{
4705     bool vector256 = false;
4706     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
4712   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4713   match(Set dst (LShiftVI src shift));
4714   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4715   ins_encode %{
4716     bool vector256 = true;
4717     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4718   %}
4719   ins_pipe( pipe_slow );
4720 %}
4721 
4722 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4723   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4724   match(Set dst (LShiftVI src shift));
4725   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4726   ins_encode %{
4727     bool vector256 = true;
4728     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4729   %}
4730   ins_pipe( pipe_slow );
4731 %}
4732 
4733 // Longs vector left shift
4734 instruct vsll2L(vecX dst, vecS shift) %{
4735   predicate(n->as_Vector()->length() == 2);
4736   match(Set dst (LShiftVL dst shift));
4737   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4738   ins_encode %{
4739     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
4740   %}
4741   ins_pipe( pipe_slow );
4742 %}
4743 
4744 instruct vsll2L_imm(vecX dst, immI8 shift) %{
4745   predicate(n->as_Vector()->length() == 2);
4746   match(Set dst (LShiftVL dst shift));
4747   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4748   ins_encode %{
4749     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
4750   %}
4751   ins_pipe( pipe_slow );
4752 %}
4753 
4754 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
4755   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4756   match(Set dst (LShiftVL src shift));
4757   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4758   ins_encode %{
4759     bool vector256 = false;
4760     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4761   %}
4762   ins_pipe( pipe_slow );
4763 %}
4764 
4765 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4766   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4767   match(Set dst (LShiftVL src shift));
4768   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4769   ins_encode %{
4770     bool vector256 = false;
4771     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4772   %}
4773   ins_pipe( pipe_slow );
4774 %}
4775 
4776 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
4777   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4778   match(Set dst (LShiftVL src shift));
4779   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4780   ins_encode %{
4781     bool vector256 = true;
4782     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4783   %}
4784   ins_pipe( pipe_slow );
4785 %}
4786 
4787 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4788   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4789   match(Set dst (LShiftVL src shift));
4790   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4791   ins_encode %{
4792     bool vector256 = true;
4793     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4794   %}
4795   ins_pipe( pipe_slow );
4796 %}
4797 
4798 // ----------------------- LogicalRightShift -----------------------------------
4799 
// A logical right shift of a shorts vector produces an incorrect Java result
// for negative data because Java code converts the short value to an int with
// sign extension before the shift. Char vectors are fine since chars are
// unsigned values.
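//
// A hedged worked example of the mismatch (illustrative only, assuming a
// simple loop body a[i] = (short)(a[i] >>> 3)):
//
//   short element -1:    ((int)-1) >>> 3         = 0x1FFFFFFF
//                        (short)0x1FFFFFFF       = 0xFFFF  (-1)
//                        psrlw on lane 0xFFFF, 3 = 0x1FFF  (8191)   <-- differs
//
//   char element 0xFFFF: ((int)0xFFFF) >>> 3     = 0x1FFF  (8191)
//                        psrlw on lane 0xFFFF, 3 = 0x1FFF  (8191)   <-- matches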
4804 
4805 instruct vsrl2S(vecS dst, vecS shift) %{
4806   predicate(n->as_Vector()->length() == 2);
4807   match(Set dst (URShiftVS dst shift));
4808   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4809   ins_encode %{
4810     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4811   %}
4812   ins_pipe( pipe_slow );
4813 %}
4814 
4815 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
4816   predicate(n->as_Vector()->length() == 2);
4817   match(Set dst (URShiftVS dst shift));
4818   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4819   ins_encode %{
4820     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4821   %}
4822   ins_pipe( pipe_slow );
4823 %}
4824 
4825 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
4826   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4827   match(Set dst (URShiftVS src shift));
4828   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4829   ins_encode %{
4830     bool vector256 = false;
4831     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4832   %}
4833   ins_pipe( pipe_slow );
4834 %}
4835 
4836 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4837   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4838   match(Set dst (URShiftVS src shift));
4839   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4840   ins_encode %{
4841     bool vector256 = false;
4842     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4843   %}
4844   ins_pipe( pipe_slow );
4845 %}
4846 
4847 instruct vsrl4S(vecD dst, vecS shift) %{
4848   predicate(n->as_Vector()->length() == 4);
4849   match(Set dst (URShiftVS dst shift));
4850   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4851   ins_encode %{
4852     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4853   %}
4854   ins_pipe( pipe_slow );
4855 %}
4856 
4857 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
4858   predicate(n->as_Vector()->length() == 4);
4859   match(Set dst (URShiftVS dst shift));
4860   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4861   ins_encode %{
4862     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4863   %}
4864   ins_pipe( pipe_slow );
4865 %}
4866 
4867 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
4868   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4869   match(Set dst (URShiftVS src shift));
4870   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4871   ins_encode %{
4872     bool vector256 = false;
4873     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4874   %}
4875   ins_pipe( pipe_slow );
4876 %}
4877 
4878 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4879   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4880   match(Set dst (URShiftVS src shift));
4881   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4882   ins_encode %{
4883     bool vector256 = false;
4884     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4885   %}
4886   ins_pipe( pipe_slow );
4887 %}
4888 
4889 instruct vsrl8S(vecX dst, vecS shift) %{
4890   predicate(n->as_Vector()->length() == 8);
4891   match(Set dst (URShiftVS dst shift));
4892   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4893   ins_encode %{
4894     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4895   %}
4896   ins_pipe( pipe_slow );
4897 %}
4898 
4899 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
4900   predicate(n->as_Vector()->length() == 8);
4901   match(Set dst (URShiftVS dst shift));
4902   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4903   ins_encode %{
4904     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4905   %}
4906   ins_pipe( pipe_slow );
4907 %}
4908 
4909 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
4910   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4911   match(Set dst (URShiftVS src shift));
4912   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4913   ins_encode %{
4914     bool vector256 = false;
4915     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4916   %}
4917   ins_pipe( pipe_slow );
4918 %}
4919 
4920 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4921   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4922   match(Set dst (URShiftVS src shift));
4923   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4924   ins_encode %{
4925     bool vector256 = false;
4926     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4927   %}
4928   ins_pipe( pipe_slow );
4929 %}
4930 
4931 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
4932   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4933   match(Set dst (URShiftVS src shift));
4934   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4935   ins_encode %{
4936     bool vector256 = true;
4937     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4938   %}
4939   ins_pipe( pipe_slow );
4940 %}
4941 
4942 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4943   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4944   match(Set dst (URShiftVS src shift));
4945   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4946   ins_encode %{
4947     bool vector256 = true;
4948     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4949   %}
4950   ins_pipe( pipe_slow );
4951 %}
4952 
4953 // Integers vector logical right shift
4954 instruct vsrl2I(vecD dst, vecS shift) %{
4955   predicate(n->as_Vector()->length() == 2);
4956   match(Set dst (URShiftVI dst shift));
4957   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4958   ins_encode %{
4959     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
4965   predicate(n->as_Vector()->length() == 2);
4966   match(Set dst (URShiftVI dst shift));
4967   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4968   ins_encode %{
4969     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
4970   %}
4971   ins_pipe( pipe_slow );
4972 %}
4973 
4974 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
4975   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4976   match(Set dst (URShiftVI src shift));
4977   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4978   ins_encode %{
4979     bool vector256 = false;
4980     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4981   %}
4982   ins_pipe( pipe_slow );
4983 %}
4984 
4985 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4986   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4987   match(Set dst (URShiftVI src shift));
4988   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4989   ins_encode %{
4990     bool vector256 = false;
4991     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4992   %}
4993   ins_pipe( pipe_slow );
4994 %}
4995 
4996 instruct vsrl4I(vecX dst, vecS shift) %{
4997   predicate(n->as_Vector()->length() == 4);
4998   match(Set dst (URShiftVI dst shift));
4999   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
5000   ins_encode %{
5001     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
5002   %}
5003   ins_pipe( pipe_slow );
5004 %}
5005 
5006 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
5007   predicate(n->as_Vector()->length() == 4);
5008   match(Set dst (URShiftVI dst shift));
5009   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
5010   ins_encode %{
5011     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
5012   %}
5013   ins_pipe( pipe_slow );
5014 %}
5015 
5016 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
5017   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5018   match(Set dst (URShiftVI src shift));
5019   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
5020   ins_encode %{
5021     bool vector256 = false;
5022     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5023   %}
5024   ins_pipe( pipe_slow );
5025 %}
5026 
5027 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
5028   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5029   match(Set dst (URShiftVI src shift));
5030   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
5031   ins_encode %{
5032     bool vector256 = false;
5033     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5034   %}
5035   ins_pipe( pipe_slow );
5036 %}
5037 
5038 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
5039   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5040   match(Set dst (URShiftVI src shift));
5041   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
5042   ins_encode %{
5043     bool vector256 = true;
5044     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5045   %}
5046   ins_pipe( pipe_slow );
5047 %}
5048 
5049 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
5050   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5051   match(Set dst (URShiftVI src shift));
5052   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
5053   ins_encode %{
5054     bool vector256 = true;
5055     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5056   %}
5057   ins_pipe( pipe_slow );
5058 %}
5059 
5060 // Longs vector logical right shift
5061 instruct vsrl2L(vecX dst, vecS shift) %{
5062   predicate(n->as_Vector()->length() == 2);
5063   match(Set dst (URShiftVL dst shift));
5064   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
5065   ins_encode %{
5066     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
5067   %}
5068   ins_pipe( pipe_slow );
5069 %}
5070 
5071 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
5072   predicate(n->as_Vector()->length() == 2);
5073   match(Set dst (URShiftVL dst shift));
5074   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
5075   ins_encode %{
5076     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
5077   %}
5078   ins_pipe( pipe_slow );
5079 %}
5080 
5081 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
5082   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5083   match(Set dst (URShiftVL src shift));
5084   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
5085   ins_encode %{
5086     bool vector256 = false;
5087     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5088   %}
5089   ins_pipe( pipe_slow );
5090 %}
5091 
5092 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
5093   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5094   match(Set dst (URShiftVL src shift));
5095   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
5096   ins_encode %{
5097     bool vector256 = false;
5098     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5099   %}
5100   ins_pipe( pipe_slow );
5101 %}
5102 
5103 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
5104   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
5105   match(Set dst (URShiftVL src shift));
5106   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
5107   ins_encode %{
5108     bool vector256 = true;
5109     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5110   %}
5111   ins_pipe( pipe_slow );
5112 %}
5113 
5114 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
5115   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
5116   match(Set dst (URShiftVL src shift));
5117   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
5118   ins_encode %{
5119     bool vector256 = true;
5120     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5121   %}
5122   ins_pipe( pipe_slow );
5123 %}
5124 
5125 // ------------------- ArithmeticRightShift -----------------------------------
5126 
5127 // Shorts/Chars vector arithmetic right shift
5128 instruct vsra2S(vecS dst, vecS shift) %{
5129   predicate(n->as_Vector()->length() == 2);
5130   match(Set dst (RShiftVS dst shift));
5131   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
5132   ins_encode %{
5133     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
5134   %}
5135   ins_pipe( pipe_slow );
5136 %}
5137 
5138 instruct vsra2S_imm(vecS dst, immI8 shift) %{
5139   predicate(n->as_Vector()->length() == 2);
5140   match(Set dst (RShiftVS dst shift));
5141   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
5142   ins_encode %{
5143     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
5144   %}
5145   ins_pipe( pipe_slow );
5146 %}
5147 
5148 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
5149   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5150   match(Set dst (RShiftVS src shift));
5151   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
5152   ins_encode %{
5153     bool vector256 = false;
5154     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5155   %}
5156   ins_pipe( pipe_slow );
5157 %}
5158 
5159 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
5160   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5161   match(Set dst (RShiftVS src shift));
5162   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
5163   ins_encode %{
5164     bool vector256 = false;
5165     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5166   %}
5167   ins_pipe( pipe_slow );
5168 %}
5169 
5170 instruct vsra4S(vecD dst, vecS shift) %{
5171   predicate(n->as_Vector()->length() == 4);
5172   match(Set dst (RShiftVS dst shift));
5173   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
5174   ins_encode %{
5175     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
5176   %}
5177   ins_pipe( pipe_slow );
5178 %}
5179 
5180 instruct vsra4S_imm(vecD dst, immI8 shift) %{
5181   predicate(n->as_Vector()->length() == 4);
5182   match(Set dst (RShiftVS dst shift));
5183   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
5184   ins_encode %{
5185     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
5186   %}
5187   ins_pipe( pipe_slow );
5188 %}
5189 
5190 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
5191   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5192   match(Set dst (RShiftVS src shift));
5193   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
5194   ins_encode %{
5195     bool vector256 = false;
5196     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5197   %}
5198   ins_pipe( pipe_slow );
5199 %}
5200 
5201 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
5202   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5203   match(Set dst (RShiftVS src shift));
5204   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
5205   ins_encode %{
5206     bool vector256 = false;
5207     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5208   %}
5209   ins_pipe( pipe_slow );
5210 %}
5211 
5212 instruct vsra8S(vecX dst, vecS shift) %{
5213   predicate(n->as_Vector()->length() == 8);
5214   match(Set dst (RShiftVS dst shift));
5215   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
5216   ins_encode %{
5217     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
5218   %}
5219   ins_pipe( pipe_slow );
5220 %}
5221 
5222 instruct vsra8S_imm(vecX dst, immI8 shift) %{
5223   predicate(n->as_Vector()->length() == 8);
5224   match(Set dst (RShiftVS dst shift));
5225   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
5226   ins_encode %{
5227     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
5228   %}
5229   ins_pipe( pipe_slow );
5230 %}
5231 
5232 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
5233   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5234   match(Set dst (RShiftVS src shift));
5235   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
5236   ins_encode %{
5237     bool vector256 = false;
5238     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5239   %}
5240   ins_pipe( pipe_slow );
5241 %}
5242 
5243 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
5244   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5245   match(Set dst (RShiftVS src shift));
5246   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
5247   ins_encode %{
5248     bool vector256 = false;
5249     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5250   %}
5251   ins_pipe( pipe_slow );
5252 %}
5253 
5254 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
5255   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5256   match(Set dst (RShiftVS src shift));
5257   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
5258   ins_encode %{
5259     bool vector256 = true;
5260     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5261   %}
5262   ins_pipe( pipe_slow );
5263 %}
5264 
5265 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
5266   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5267   match(Set dst (RShiftVS src shift));
5268   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
5269   ins_encode %{
5270     bool vector256 = true;
5271     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5272   %}
5273   ins_pipe( pipe_slow );
5274 %}
5275 
5276 // Integers vector arithmetic right shift
5277 instruct vsra2I(vecD dst, vecS shift) %{
5278   predicate(n->as_Vector()->length() == 2);
5279   match(Set dst (RShiftVI dst shift));
5280   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
5281   ins_encode %{
5282     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
5283   %}
5284   ins_pipe( pipe_slow );
5285 %}
5286 
5287 instruct vsra2I_imm(vecD dst, immI8 shift) %{
5288   predicate(n->as_Vector()->length() == 2);
5289   match(Set dst (RShiftVI dst shift));
5290   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
5291   ins_encode %{
5292     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
5293   %}
5294   ins_pipe( pipe_slow );
5295 %}
5296 
5297 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
5298   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5299   match(Set dst (RShiftVI src shift));
5300   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
5301   ins_encode %{
5302     bool vector256 = false;
5303     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5304   %}
5305   ins_pipe( pipe_slow );
5306 %}
5307 
5308 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
5309   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5310   match(Set dst (RShiftVI src shift));
5311   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
5312   ins_encode %{
5313     bool vector256 = false;
5314     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5315   %}
5316   ins_pipe( pipe_slow );
5317 %}
5318 
5319 instruct vsra4I(vecX dst, vecS shift) %{
5320   predicate(n->as_Vector()->length() == 4);
5321   match(Set dst (RShiftVI dst shift));
5322   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
5323   ins_encode %{
5324     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
5325   %}
5326   ins_pipe( pipe_slow );
5327 %}
5328 
5329 instruct vsra4I_imm(vecX dst, immI8 shift) %{
5330   predicate(n->as_Vector()->length() == 4);
5331   match(Set dst (RShiftVI dst shift));
5332   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
5333   ins_encode %{
5334     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
5335   %}
5336   ins_pipe( pipe_slow );
5337 %}
5338 
5339 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
5340   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5341   match(Set dst (RShiftVI src shift));
5342   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
5343   ins_encode %{
5344     bool vector256 = false;
5345     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5346   %}
5347   ins_pipe( pipe_slow );
5348 %}
5349 
5350 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
5351   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5352   match(Set dst (RShiftVI src shift));
5353   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
5354   ins_encode %{
5355     bool vector256 = false;
5356     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5357   %}
5358   ins_pipe( pipe_slow );
5359 %}
5360 
5361 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
5362   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5363   match(Set dst (RShiftVI src shift));
5364   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
5365   ins_encode %{
5366     bool vector256 = true;
5367     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5368   %}
5369   ins_pipe( pipe_slow );
5370 %}
5371 
5372 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
5373   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5374   match(Set dst (RShiftVI src shift));
5375   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
5376   ins_encode %{
5377     bool vector256 = true;
5378     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5379   %}
5380   ins_pipe( pipe_slow );
5381 %}
5382 
5383 // There are no longs vector arithmetic right shift instructions.
5384 
5385 
5386 // --------------------------------- AND --------------------------------------
5387 
5388 instruct vand4B(vecS dst, vecS src) %{
5389   predicate(n->as_Vector()->length_in_bytes() == 4);
5390   match(Set dst (AndV dst src));
5391   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
5392   ins_encode %{
5393     __ pand($dst$$XMMRegister, $src$$XMMRegister);
5394   %}
5395   ins_pipe( pipe_slow );
5396 %}
5397 
5398 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
5399   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5400   match(Set dst (AndV src1 src2));
5401   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
5402   ins_encode %{
5403     bool vector256 = false;
5404     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5405   %}
5406   ins_pipe( pipe_slow );
5407 %}
5408 
5409 instruct vand8B(vecD dst, vecD src) %{
5410   predicate(n->as_Vector()->length_in_bytes() == 8);
5411   match(Set dst (AndV dst src));
5412   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
5413   ins_encode %{
5414     __ pand($dst$$XMMRegister, $src$$XMMRegister);
5415   %}
5416   ins_pipe( pipe_slow );
5417 %}
5418 
5419 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
5420   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5421   match(Set dst (AndV src1 src2));
5422   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
5423   ins_encode %{
5424     bool vector256 = false;
5425     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5426   %}
5427   ins_pipe( pipe_slow );
5428 %}
5429 
5430 instruct vand16B(vecX dst, vecX src) %{
5431   predicate(n->as_Vector()->length_in_bytes() == 16);
5432   match(Set dst (AndV dst src));
5433   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
5434   ins_encode %{
5435     __ pand($dst$$XMMRegister, $src$$XMMRegister);
5436   %}
5437   ins_pipe( pipe_slow );
5438 %}
5439 
5440 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
5441   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5442   match(Set dst (AndV src1 src2));
5443   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
5444   ins_encode %{
5445     bool vector256 = false;
5446     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5447   %}
5448   ins_pipe( pipe_slow );
5449 %}
5450 
5451 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
5452   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5453   match(Set dst (AndV src (LoadVector mem)));
5454   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
5455   ins_encode %{
5456     bool vector256 = false;
5457     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5458   %}
5459   ins_pipe( pipe_slow );
5460 %}
5461 
5462 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
5463   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5464   match(Set dst (AndV src1 src2));
5465   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
5466   ins_encode %{
5467     bool vector256 = true;
5468     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5469   %}
5470   ins_pipe( pipe_slow );
5471 %}
5472 
5473 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
5474   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5475   match(Set dst (AndV src (LoadVector mem)));
5476   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
5477   ins_encode %{
5478     bool vector256 = true;
5479     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5480   %}
5481   ins_pipe( pipe_slow );
5482 %}
5483 
5484 // --------------------------------- OR ---------------------------------------
5485 
5486 instruct vor4B(vecS dst, vecS src) %{
5487   predicate(n->as_Vector()->length_in_bytes() == 4);
5488   match(Set dst (OrV dst src));
5489   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
5490   ins_encode %{
5491     __ por($dst$$XMMRegister, $src$$XMMRegister);
5492   %}
5493   ins_pipe( pipe_slow );
5494 %}
5495 
5496 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
5497   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5498   match(Set dst (OrV src1 src2));
5499   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
5500   ins_encode %{
5501     bool vector256 = false;
5502     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5503   %}
5504   ins_pipe( pipe_slow );
5505 %}
5506 
5507 instruct vor8B(vecD dst, vecD src) %{
5508   predicate(n->as_Vector()->length_in_bytes() == 8);
5509   match(Set dst (OrV dst src));
5510   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
5511   ins_encode %{
5512     __ por($dst$$XMMRegister, $src$$XMMRegister);
5513   %}
5514   ins_pipe( pipe_slow );
5515 %}
5516 
5517 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
5518   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5519   match(Set dst (OrV src1 src2));
5520   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
5521   ins_encode %{
5522     bool vector256 = false;
5523     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5524   %}
5525   ins_pipe( pipe_slow );
5526 %}
5527 
5528 instruct vor16B(vecX dst, vecX src) %{
5529   predicate(n->as_Vector()->length_in_bytes() == 16);
5530   match(Set dst (OrV dst src));
5531   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
5532   ins_encode %{
5533     __ por($dst$$XMMRegister, $src$$XMMRegister);
5534   %}
5535   ins_pipe( pipe_slow );
5536 %}
5537 
5538 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
5539   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5540   match(Set dst (OrV src1 src2));
5541   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
5542   ins_encode %{
5543     bool vector256 = false;
5544     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5545   %}
5546   ins_pipe( pipe_slow );
5547 %}
5548 
5549 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
5550   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5551   match(Set dst (OrV src (LoadVector mem)));
5552   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
5553   ins_encode %{
5554     bool vector256 = false;
5555     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5556   %}
5557   ins_pipe( pipe_slow );
5558 %}
5559 
5560 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
5561   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5562   match(Set dst (OrV src1 src2));
5563   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
5564   ins_encode %{
5565     bool vector256 = true;
5566     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5567   %}
5568   ins_pipe( pipe_slow );
5569 %}
5570 
5571 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
5572   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5573   match(Set dst (OrV src (LoadVector mem)));
5574   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
5575   ins_encode %{
5576     bool vector256 = true;
5577     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5578   %}
5579   ins_pipe( pipe_slow );
5580 %}
5581 
5582 // --------------------------------- XOR --------------------------------------
5583 
5584 instruct vxor4B(vecS dst, vecS src) %{
5585   predicate(n->as_Vector()->length_in_bytes() == 4);
5586   match(Set dst (XorV dst src));
5587   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
5588   ins_encode %{
5589     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5590   %}
5591   ins_pipe( pipe_slow );
5592 %}
5593 
5594 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
5595   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5596   match(Set dst (XorV src1 src2));
5597   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
5598   ins_encode %{
5599     bool vector256 = false;
5600     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5601   %}
5602   ins_pipe( pipe_slow );
5603 %}
5604 
5605 instruct vxor8B(vecD dst, vecD src) %{
5606   predicate(n->as_Vector()->length_in_bytes() == 8);
5607   match(Set dst (XorV dst src));
5608   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
5609   ins_encode %{
5610     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5611   %}
5612   ins_pipe( pipe_slow );
5613 %}
5614 
5615 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
5616   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5617   match(Set dst (XorV src1 src2));
5618   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
5619   ins_encode %{
5620     bool vector256 = false;
5621     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5622   %}
5623   ins_pipe( pipe_slow );
5624 %}
5625 
5626 instruct vxor16B(vecX dst, vecX src) %{
5627   predicate(n->as_Vector()->length_in_bytes() == 16);
5628   match(Set dst (XorV dst src));
5629   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
5630   ins_encode %{
5631     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5632   %}
5633   ins_pipe( pipe_slow );
5634 %}
5635 
5636 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
5637   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5638   match(Set dst (XorV src1 src2));
5639   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
5640   ins_encode %{
5641     bool vector256 = false;
5642     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5643   %}
5644   ins_pipe( pipe_slow );
5645 %}
5646 
5647 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
5648   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5649   match(Set dst (XorV src (LoadVector mem)));
5650   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
5651   ins_encode %{
5652     bool vector256 = false;
5653     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5654   %}
5655   ins_pipe( pipe_slow );
5656 %}
5657 
5658 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
5659   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5660   match(Set dst (XorV src1 src2));
5661   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
5662   ins_encode %{
5663     bool vector256 = true;
5664     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5665   %}
5666   ins_pipe( pipe_slow );
5667 %}
5668 
5669 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
5670   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5671   match(Set dst (XorV src (LoadVector mem)));
5672   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
5673   ins_encode %{
5674     bool vector256 = true;
5675     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5676   %}
5677   ins_pipe( pipe_slow );
5678 %}
5679