1 //
   2 // Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  256-bit registers, i.e. 8 words each, labeled (a)-(h).
  63 // Word a in each register holds a Float, words a-b hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
  68 // Linux ABI:   No registers are preserved across function calls
  69 //              XMM0-XMM7 might hold parameters
  70 // Windows ABI: XMM6-XMM15 preserved across function calls
  71 //              XMM0-XMM3 might hold parameters
  72 
  73 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  74 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  75 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  76 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  77 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  78 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  79 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  80 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  81 
  82 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  83 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  84 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  85 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  86 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  87 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  88 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  89 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  90 
  91 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  92 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  93 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  94 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  95 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  96 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  97 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  98 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  99 
 100 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 101 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 102 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 103 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 104 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 105 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 106 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 107 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 108 
 109 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 110 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 111 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 112 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 113 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 114 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 115 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 116 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 117 
 118 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 119 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 120 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 121 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 122 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 123 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 124 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 125 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 126 
 127 #ifdef _WIN64
 128 
 129 reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
 130 reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 131 reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 132 reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 133 reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 134 reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 135 reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 136 reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 137 
 138 reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
 139 reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 140 reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 141 reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 142 reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 143 reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 144 reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 145 reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 146 
 147 reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
 148 reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 149 reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 150 reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 151 reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 152 reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 153 reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 154 reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 155 
 156 reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
 157 reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 158 reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 159 reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 160 reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 161 reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 162 reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 163 reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 164 
 165 reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
 166 reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 167 reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 168 reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 169 reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 170 reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 171 reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 172 reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 173 
 174 reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
 175 reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 176 reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 177 reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 178 reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 179 reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 180 reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 181 reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 182 
 183 reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
 184 reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 185 reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 186 reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 187 reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 188 reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 189 reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 190 reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 191 
 192 reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
 193 reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 194 reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 195 reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 196 reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 197 reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 198 reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 199 reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 200 
 201 reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
 202 reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 203 reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 204 reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 205 reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 206 reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 207 reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 208 reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 209 
 210 reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
 211 reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 212 reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 213 reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 214 reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 215 reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 216 reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 217 reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 218 
 219 #else // _WIN64
 220 
 221 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 222 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 223 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 224 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 225 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 226 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 227 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 228 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 229 
 230 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 231 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 232 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 233 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 234 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 235 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 236 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 237 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 238 
 239 #ifdef _LP64
 240 
 241 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 242 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 243 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 244 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 245 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 246 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 247 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 248 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 249 
 250 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 251 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 252 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 253 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 254 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 255 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 256 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 257 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 258 
 259 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 260 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 261 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 262 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 263 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 264 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 265 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 266 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 267 
 268 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 269 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 270 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 271 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 272 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 273 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 274 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 275 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 276 
 277 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 278 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 279 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 280 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 281 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 282 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 283 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 284 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 285 
 286 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 287 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 288 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 289 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 290 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 291 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 292 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 293 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 294 
 295 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 296 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 297 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 298 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 299 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 300 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 301 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 302 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 303 
 304 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 305 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 306 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 307 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 308 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 309 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 310 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 311 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 312 
 313 #endif // _LP64
 314 
 315 #endif // _WIN64
 316 
 317 #ifdef _LP64
 318 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 319 #else
 320 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 321 #endif // _LP64
 322 
 323 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 324                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 325                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 326                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 327                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 328                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 329                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 330                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 331 #ifdef _LP64
 332                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 333                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 334                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 335                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 336                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 337                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 338                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 339                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 340 #endif
 341                    );
 342 
 343 // flags allocation class should be last.
 344 alloc_class chunk2(RFLAGS);
 345 
 346 // Singleton class for condition codes
 347 reg_class int_flags(RFLAGS);
 348 
 349 // Class for all float registers
 350 reg_class float_reg(XMM0,
 351                     XMM1,
 352                     XMM2,
 353                     XMM3,
 354                     XMM4,
 355                     XMM5,
 356                     XMM6,
 357                     XMM7
 358 #ifdef _LP64
 359                    ,XMM8,
 360                     XMM9,
 361                     XMM10,
 362                     XMM11,
 363                     XMM12,
 364                     XMM13,
 365                     XMM14,
 366                     XMM15
 367 #endif
 368                     );
 369 
 370 // Class for all double registers
 371 reg_class double_reg(XMM0,  XMM0b,
 372                      XMM1,  XMM1b,
 373                      XMM2,  XMM2b,
 374                      XMM3,  XMM3b,
 375                      XMM4,  XMM4b,
 376                      XMM5,  XMM5b,
 377                      XMM6,  XMM6b,
 378                      XMM7,  XMM7b
 379 #ifdef _LP64
 380                     ,XMM8,  XMM8b,
 381                      XMM9,  XMM9b,
 382                      XMM10, XMM10b,
 383                      XMM11, XMM11b,
 384                      XMM12, XMM12b,
 385                      XMM13, XMM13b,
 386                      XMM14, XMM14b,
 387                      XMM15, XMM15b
 388 #endif
 389                      );
 390 
 391 // Class for all 32bit vector registers
 392 reg_class vectors_reg(XMM0,
 393                       XMM1,
 394                       XMM2,
 395                       XMM3,
 396                       XMM4,
 397                       XMM5,
 398                       XMM6,
 399                       XMM7
 400 #ifdef _LP64
 401                      ,XMM8,
 402                       XMM9,
 403                       XMM10,
 404                       XMM11,
 405                       XMM12,
 406                       XMM13,
 407                       XMM14,
 408                       XMM15
 409 #endif
 410                       );
 411 
 412 // Class for all 64bit vector registers
 413 reg_class vectord_reg(XMM0,  XMM0b,
 414                       XMM1,  XMM1b,
 415                       XMM2,  XMM2b,
 416                       XMM3,  XMM3b,
 417                       XMM4,  XMM4b,
 418                       XMM5,  XMM5b,
 419                       XMM6,  XMM6b,
 420                       XMM7,  XMM7b
 421 #ifdef _LP64
 422                      ,XMM8,  XMM8b,
 423                       XMM9,  XMM9b,
 424                       XMM10, XMM10b,
 425                       XMM11, XMM11b,
 426                       XMM12, XMM12b,
 427                       XMM13, XMM13b,
 428                       XMM14, XMM14b,
 429                       XMM15, XMM15b
 430 #endif
 431                       );
 432 
 433 // Class for all 128bit vector registers
 434 reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
 435                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 436                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 437                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 438                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 439                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 440                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 441                       XMM7,  XMM7b,  XMM7c,  XMM7d
 442 #ifdef _LP64
 443                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 444                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 445                       XMM10, XMM10b, XMM10c, XMM10d,
 446                       XMM11, XMM11b, XMM11c, XMM11d,
 447                       XMM12, XMM12b, XMM12c, XMM12d,
 448                       XMM13, XMM13b, XMM13c, XMM13d,
 449                       XMM14, XMM14b, XMM14c, XMM14d,
 450                       XMM15, XMM15b, XMM15c, XMM15d
 451 #endif
 452                       );
 453 
 454 // Class for all 256bit vector registers
 455 reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 456                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 457                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 458                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 459                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 460                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 461                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 462                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 463 #ifdef _LP64
 464                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 465                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 466                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 467                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 468                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 469                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 470                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 471                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 472 #endif
 473                       );
 474 
 475 %}
 476 
 477 
 478 //----------SOURCE BLOCK-------------------------------------------------------
 479 // This is a block of C++ code which provides values, functions, and
 480 // definitions necessary in the rest of the architecture description
 481 
 482 source_hpp %{
 483 // Header information of the source block.
 484 // Method declarations/definitions which are used outside
 485 // the ad-scope can conveniently be defined here.
 486 //
 487 // To keep related declarations/definitions/uses close together,
 488 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 489 
 490 class CallStubImpl {
 491  
 492   //--------------------------------------------------------------
 493   //---<  Used for optimization in Compile::shorten_branches  >---
 494   //--------------------------------------------------------------
 495 
 496  public:
 497   // Size of call trampoline stub.
 498   static uint size_call_trampoline() {
 499     return 0; // no call trampolines on this platform
 500   }
 501   
 502   // number of relocations needed by a call trampoline stub
 503   static uint reloc_call_trampoline() { 
 504     return 0; // no call trampolines on this platform
 505   }
 506 };
 507 
 508 class HandlerImpl {
 509 
 510  public:
 511 
 512   static int emit_exception_handler(CodeBuffer &cbuf);
 513   static int emit_deopt_handler(CodeBuffer& cbuf);
 514 
 515   static uint size_exception_handler() {
 516     // NativeCall instruction size is the same as NativeJump.
 517     // The exception handler starts out as a jump and can be patched to
 518     // a call by deoptimization.  (4932387)
 519     // Note that this value is also credited (in output.cpp) to
 520     // the size of the code section.
 521     return NativeJump::instruction_size;
 522   }
 523 
 524 #ifdef _LP64
 525   static uint size_deopt_handler() {
 526     // three 5 byte instructions
 527     return 15;
 528   }
 529 #else
 530   static uint size_deopt_handler() {
 531     // NativeCall instruction size is the same as NativeJump.
 532     // The exception handler starts out as a jump and can be patched to
 533     // a call by deoptimization.  (4932387)
 534     // Note that this value is also credited (in output.cpp) to
 535     // the size of the code section.
 536     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 537   }
 538 #endif
 539 };
 540 
 541 %} // end source_hpp
 542 
 543 source %{
 544 
 545 // Emit exception handler code.
 546 // Stuff framesize into a register and call a VM stub routine.
 547 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 548 
 549   // Note that the code buffer's insts_mark is always relative to insts.
 550   // That's why we must use the macroassembler to generate a handler.
 551   MacroAssembler _masm(&cbuf);
 552   address base = __ start_a_stub(size_exception_handler());
 553   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 554   int offset = __ offset();
 555   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 556   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 557   __ end_a_stub();
 558   return offset;
 559 }
 560 
 561 // Emit deopt handler code.
 562 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 563 
 564   // Note that the code buffer's insts_mark is always relative to insts.
 565   // That's why we must use the macroassembler to generate a handler.
 566   MacroAssembler _masm(&cbuf);
 567   address base = __ start_a_stub(size_deopt_handler());
 568   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 569   int offset = __ offset();
 570 
 571 #ifdef _LP64
 572   address the_pc = (address) __ pc();
 573   Label next;
 574   // push a "the_pc" on the stack without destroying any registers
 575   // as they all may be live.
 576 
 577   // push address of "next"
 578   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 579   __ bind(next);
 580   // adjust it so it matches "the_pc"
 581   __ subptr(Address(rsp, 0), __ offset() - offset);
 582 #else
 583   InternalAddress here(__ pc());
 584   __ pushptr(here.addr());
 585 #endif
 586 
 587   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 588   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
 589   __ end_a_stub();
 590   return offset;
 591 }
 592 
 593 
 594 //=============================================================================
 595 
 596   // Float masks come from different places depending on platform.
 597 #ifdef _LP64
 598   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 599   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 600   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 601   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 602 #else
 603   static address float_signmask()  { return (address)float_signmask_pool; }
 604   static address float_signflip()  { return (address)float_signflip_pool; }
 605   static address double_signmask() { return (address)double_signmask_pool; }
 606   static address double_signflip() { return (address)double_signflip_pool; }
 607 #endif
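
// What the sign masks above are for, roughly: clearing the sign bit of an
// IEEE-754 value yields its absolute value, and flipping the sign bit negates
// it, which is the trick the SSE and/xor instructions perform with these mask
// constants.  The snippet below is an illustration only (not VM code, kept out
// of the build with #if 0); the 0x7fffffff / 0x80000000 lane patterns are the
// usual IEEE-754 sign-bit masks and are assumed here just for the example.
#if 0
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  float x = -2.5f;
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  uint32_t abs_bits = bits & 0x7fffffffu;  // analogue of float_signmask()
  uint32_t neg_bits = bits ^ 0x80000000u;  // analogue of float_signflip()
  float a, n;
  std::memcpy(&a, &abs_bits, sizeof a);
  std::memcpy(&n, &neg_bits, sizeof n);
  std::printf("abs=%f neg=%f\n", a, n);    // prints abs=2.500000 neg=2.500000
  return 0;
}
#endif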
 608 
 609 
 610 const bool Matcher::match_rule_supported(int opcode) {
 611   if (!has_match_rule(opcode))
 612     return false;
 613 
 614   switch (opcode) {
 615     case Op_PopCountI:
 616     case Op_PopCountL:
 617       if (!UsePopCountInstruction)
 618         return false;
 619     break;
 620     case Op_MulVI:
 621       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
 622         return false;
 623     break;
 624     case Op_CompareAndSwapL:
 625 #ifdef _LP64
 626     case Op_CompareAndSwapP:
 627 #endif
 628       if (!VM_Version::supports_cx8())
 629         return false;
 630     break;
 631   }
 632 
 633   return true;  // Per default match rules are supported.
 634 }
 635 
 636 // Max vector size in bytes. 0 if not supported.
 637 const int Matcher::vector_width_in_bytes(BasicType bt) {
 638   assert(is_java_primitive(bt), "only primitive type vectors");
 639   if (UseSSE < 2) return 0;
 640   // SSE2 supports 128bit vectors for all types.
 641   // AVX2 supports 256bit vectors for all types.
 642   int size = (UseAVX > 1) ? 32 : 16;
 643   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 644   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 645     size = 32;
 646   // Use flag to limit vector size.
 647   size = MIN2(size,(int)MaxVectorSize);
 648   // Minimum 2 values in vector (or 4 for bytes).
 649   switch (bt) {
 650   case T_DOUBLE:
 651   case T_LONG:
 652     if (size < 16) return 0;
 653   case T_FLOAT:
 654   case T_INT:
 655     if (size < 8) return 0;
 656   case T_BOOLEAN:
 657   case T_BYTE:
 658   case T_CHAR:
 659   case T_SHORT:
 660     if (size < 4) return 0;
 661     break;
 662   default:
 663     ShouldNotReachHere();
 664   }
 665   return size;
 666 }
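
// To make the sizing rules above concrete, here is a minimal standalone
// sketch (illustration only, not VM code, kept out of the build with #if 0).
// It mirrors the SSE/AVX decision with plain parameters instead of the real
// UseSSE, UseAVX and MaxVectorSize flags, and omits the per-type minimum-size
// checks; the flag values in main() are assumptions for the demo.
#if 0
#include <algorithm>
#include <cstdio>

static int sketch_vector_width(bool is_fp, int use_sse, int use_avx, int max_vector_size) {
  if (use_sse < 2) return 0;                   // need at least SSE2
  int size = (use_avx > 1) ? 32 : 16;          // AVX2: 256-bit vectors for every type
  if (use_avx > 0 && is_fp) size = 32;         // AVX1: 256-bit only for FLOAT/DOUBLE
  return std::min(size, max_vector_size);      // MaxVectorSize can shrink it further
}

int main() {
  // e.g. an AVX1 machine with MaxVectorSize=32: floats get 32 bytes, ints 16.
  std::printf("T_FLOAT: %d bytes\n", sketch_vector_width(true,  4, 1, 32));  // 32
  std::printf("T_INT:   %d bytes\n", sketch_vector_width(false, 4, 1, 32));  // 16
  return 0;
}
#endif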
 667 
 668 // Limits on vector size (number of elements) loaded into vector.
 669 const int Matcher::max_vector_size(const BasicType bt) {
 670   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 671 }
 672 const int Matcher::min_vector_size(const BasicType bt) {
 673   int max_size = max_vector_size(bt);
 674   // Min size which can be loaded into vector is 4 bytes.
 675   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 676   return MIN2(size,max_size);
 677 }
 678 
 679 // Vector ideal reg corresponding to specified size in bytes
 680 const int Matcher::vector_ideal_reg(int size) {
 681   assert(MaxVectorSize >= size, "");
 682   switch(size) {
 683     case  4: return Op_VecS;
 684     case  8: return Op_VecD;
 685     case 16: return Op_VecX;
 686     case 32: return Op_VecY;
 687   }
 688   ShouldNotReachHere();
 689   return 0;
 690 }
 691 
 692 // Only lowest bits of xmm reg are used for vector shift count.
 693 const int Matcher::vector_shift_count_ideal_reg(int size) {
 694   return Op_VecS;
 695 }
 696 
 697 // x86 supports misaligned vector stores/loads.
 698 const bool Matcher::misaligned_vectors_ok() {
 699   return !AlignVector; // can be changed by flag
 700 }
 701 
 702 // x86 AES instructions are compatible with SunJCE expanded
 703 // keys, hence we do not need to pass the original key to stubs
 704 const bool Matcher::pass_original_key_for_aes() {
 705   return false;
 706 }
 707 
 708 // Helper methods for MachSpillCopyNode::implementation().
 709 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
 710                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 711   // In the 64-bit VM the size calculation is very complex, so instructions
 712   // are emitted into a scratch buffer there to obtain the size.
 713   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 714   assert(ireg == Op_VecS || // 32bit vector
 715          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 716          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 717          "no non-adjacent vector moves" );
 718   if (cbuf) {
 719     MacroAssembler _masm(cbuf);
 720     int offset = __ offset();
 721     switch (ireg) {
 722     case Op_VecS: // copy whole register
 723     case Op_VecD:
 724     case Op_VecX:
 725       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 726       break;
 727     case Op_VecY:
 728       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 729       break;
 730     default:
 731       ShouldNotReachHere();
 732     }
 733     int size = __ offset() - offset;
 734 #ifdef ASSERT
 735     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 736     assert(!do_size || size == 4, "incorrect size calculation");
 737 #endif
 738     return size;
 739 #ifndef PRODUCT
 740   } else if (!do_size) {
 741     switch (ireg) {
 742     case Op_VecS:
 743     case Op_VecD:
 744     case Op_VecX:
 745       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 746       break;
 747     case Op_VecY:
 748       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 749       break;
 750     default:
 751       ShouldNotReachHere();
 752     }
 753 #endif
 754   }
 755   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
 756   return 4;
 757 }
 758 
 759 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
 760                             int stack_offset, int reg, uint ireg, outputStream* st) {
 761   // In the 64-bit VM the size calculation is very complex, so instructions
 762   // are emitted into a scratch buffer there to obtain the size.
 763   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 764   if (cbuf) {
 765     MacroAssembler _masm(cbuf);
 766     int offset = __ offset();
 767     if (is_load) {
 768       switch (ireg) {
 769       case Op_VecS:
 770         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 771         break;
 772       case Op_VecD:
 773         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 774         break;
 775       case Op_VecX:
 776         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 777         break;
 778       case Op_VecY:
 779         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 780         break;
 781       default:
 782         ShouldNotReachHere();
 783       }
 784     } else { // store
 785       switch (ireg) {
 786       case Op_VecS:
 787         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 788         break;
 789       case Op_VecD:
 790         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 791         break;
 792       case Op_VecX:
 793         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 794         break;
 795       case Op_VecY:
 796         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 797         break;
 798       default:
 799         ShouldNotReachHere();
 800       }
 801     }
 802     int size = __ offset() - offset;
 803 #ifdef ASSERT
 804     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 805     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 806     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
 807 #endif
 808     return size;
 809 #ifndef PRODUCT
 810   } else if (!do_size) {
 811     if (is_load) {
 812       switch (ireg) {
 813       case Op_VecS:
 814         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 815         break;
 816       case Op_VecD:
 817         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 818         break;
 819        case Op_VecX:
 820         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 821         break;
 822       case Op_VecY:
 823         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 824         break;
 825       default:
 826         ShouldNotReachHere();
 827       }
 828     } else { // store
 829       switch (ireg) {
 830       case Op_VecS:
 831         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 832         break;
 833       case Op_VecD:
 834         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 835         break;
 836        case Op_VecX:
 837         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 838         break;
 839       case Op_VecY:
 840         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 841         break;
 842       default:
 843         ShouldNotReachHere();
 844       }
 845     }
 846 #endif
 847   }
 848   int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 849   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 850   return 5+offset_size;
 851 }
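
// A quick illustration of the size estimate returned above (not VM code,
// kept out of the build with #if 0): the SSE/VEX load or store itself is
// counted as 5 bytes, and the stack displacement adds 0, 1 or 4 bytes
// depending on whether the offset fits in a signed byte.  The example
// offsets in main() are assumptions for the demo.
#if 0
#include <cstdio>

static int spill_size_estimate(int stack_offset) {
  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
  return 5 + offset_size;
}

int main() {
  std::printf("%d %d %d\n",
              spill_size_estimate(0),       // 5  (no displacement)
              spill_size_estimate(0x40),    // 6  (disp8)
              spill_size_estimate(0x100));  // 9  (disp32)
  return 0;
}
#endif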
 852 
 853 static inline jfloat replicate4_imm(int con, int width) {
 854   // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
 855   assert(width == 1 || width == 2, "only byte or short types here");
 856   int bit_width = width * 8;
 857   jint val = con;
 858   val &= (1 << bit_width) - 1;  // mask off sign bits
 859   while(bit_width < 32) {
 860     val |= (val << bit_width);
 861     bit_width <<= 1;
 862   }
 863   jfloat fval = *((jfloat*) &val);  // coerce to float type
 864   return fval;
 865 }
 866 
 867 static inline jdouble replicate8_imm(int con, int width) {
 868   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
 869   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
 870   int bit_width = width * 8;
 871   jlong val = con;
 872   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
 873   while(bit_width < 64) {
 874     val |= (val << bit_width);
 875     bit_width <<= 1;
 876   }
 877   jdouble dval = *((jdouble*) &val);  // coerce to double type
 878   return dval;
 879 }
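
// Worked example of the replication helpers above (illustration only, not VM
// code, kept out of the build with #if 0): a one-byte constant is ORed into
// itself at doubling shifts until it fills the word, then the bit pattern is
// reinterpreted as a float, just as replicate4_imm does.  The 0x1A constant
// is an assumption chosen for the demo.
#if 0
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  int      width     = 1;        // a byte constant
  int      bit_width = width * 8;
  uint32_t val       = 0x1Au;    // 0x1A -> 0x1A1A -> 0x1A1A1A1A
  while (bit_width < 32) {
    val |= (val << bit_width);
    bit_width <<= 1;
  }
  float f;
  std::memcpy(&f, &val, sizeof f);   // reinterpret the bits as a float,
  (void)f;                           // like the helper's return value
  std::printf("0x%08x\n", val);      // prints 0x1a1a1a1a
  return 0;
}
#endif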
 880 
 881 #ifndef PRODUCT
 882   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 883     st->print("nop \t# %d bytes pad for loops and calls", _count);
 884   }
 885 #endif
 886 
 887   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 888     MacroAssembler _masm(&cbuf);
 889     __ nop(_count);
 890   }
 891 
 892   uint MachNopNode::size(PhaseRegAlloc*) const {
 893     return _count;
 894   }
 895 
 896 #ifndef PRODUCT
 897   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 898     st->print("# breakpoint");
 899   }
 900 #endif
 901 
 902   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 903     MacroAssembler _masm(&cbuf);
 904     __ int3();
 905   }
 906 
 907   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 908     return MachNode::size(ra_);
 909   }
 910 
 911 %}
 912 
 913 encode %{
 914 
 915   enc_class preserve_SP %{
 916     debug_only(int off0 = cbuf.insts_size());
 917     MacroAssembler _masm(&cbuf);
 918     // RBP is preserved across all calls, even compiled calls.
 919     // Use it to preserve RSP in places where the callee might change the SP.
 920     __ movptr(rbp_mh_SP_save, rsp);
 921     debug_only(int off1 = cbuf.insts_size());
 922     assert(off1 - off0 == preserve_SP_size(), "correct size prediction");
 923   %}
 924 
 925   enc_class restore_SP %{
 926     MacroAssembler _masm(&cbuf);
 927     __ movptr(rsp, rbp_mh_SP_save);
 928   %}
 929 
 930   enc_class call_epilog %{
 931     if (VerifyStackAtCalls) {
 932       // Check that stack depth is unchanged: find majik cookie on stack
 933       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 934       MacroAssembler _masm(&cbuf);
 935       Label L;
 936       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 937       __ jccb(Assembler::equal, L);
 938       // Die if stack mismatch
 939       __ int3();
 940       __ bind(L);
 941     }
 942   %}
 943 
 944 %}
 945 
 946 
 947 //----------OPERANDS-----------------------------------------------------------
 948 // Operand definitions must precede instruction definitions for correct parsing
 949 // in the ADLC because operands constitute user defined types which are used in
 950 // instruction definitions.
 951 
 952 // Vectors
 953 operand vecS() %{
 954   constraint(ALLOC_IN_RC(vectors_reg));
 955   match(VecS);
 956 
 957   format %{ %}
 958   interface(REG_INTER);
 959 %}
 960 
 961 operand vecD() %{
 962   constraint(ALLOC_IN_RC(vectord_reg));
 963   match(VecD);
 964 
 965   format %{ %}
 966   interface(REG_INTER);
 967 %}
 968 
 969 operand vecX() %{
 970   constraint(ALLOC_IN_RC(vectorx_reg));
 971   match(VecX);
 972 
 973   format %{ %}
 974   interface(REG_INTER);
 975 %}
 976 
 977 operand vecY() %{
 978   constraint(ALLOC_IN_RC(vectory_reg));
 979   match(VecY);
 980 
 981   format %{ %}
 982   interface(REG_INTER);
 983 %}
 984 
 985 
 986 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 987 
 988 // ============================================================================
 989 
 990 instruct ShouldNotReachHere() %{
 991   match(Halt);
 992   format %{ "int3\t# ShouldNotReachHere" %}
 993   ins_encode %{
 994     __ int3();
 995   %}
 996   ins_pipe(pipe_slow);
 997 %}
 998 
 999 // ============================================================================
1000 
1001 instruct addF_reg(regF dst, regF src) %{
1002   predicate((UseSSE>=1) && (UseAVX == 0));
1003   match(Set dst (AddF dst src));
1004 
1005   format %{ "addss   $dst, $src" %}
1006   ins_cost(150);
1007   ins_encode %{
1008     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1009   %}
1010   ins_pipe(pipe_slow);
1011 %}
1012 
1013 instruct addF_mem(regF dst, memory src) %{
1014   predicate((UseSSE>=1) && (UseAVX == 0));
1015   match(Set dst (AddF dst (LoadF src)));
1016 
1017   format %{ "addss   $dst, $src" %}
1018   ins_cost(150);
1019   ins_encode %{
1020     __ addss($dst$$XMMRegister, $src$$Address);
1021   %}
1022   ins_pipe(pipe_slow);
1023 %}
1024 
1025 instruct addF_imm(regF dst, immF con) %{
1026   predicate((UseSSE>=1) && (UseAVX == 0));
1027   match(Set dst (AddF dst con));
1028   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1029   ins_cost(150);
1030   ins_encode %{
1031     __ addss($dst$$XMMRegister, $constantaddress($con));
1032   %}
1033   ins_pipe(pipe_slow);
1034 %}
1035 
1036 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1037   predicate(UseAVX > 0);
1038   match(Set dst (AddF src1 src2));
1039 
1040   format %{ "vaddss  $dst, $src1, $src2" %}
1041   ins_cost(150);
1042   ins_encode %{
1043     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1044   %}
1045   ins_pipe(pipe_slow);
1046 %}
1047 
1048 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1049   predicate(UseAVX > 0);
1050   match(Set dst (AddF src1 (LoadF src2)));
1051 
1052   format %{ "vaddss  $dst, $src1, $src2" %}
1053   ins_cost(150);
1054   ins_encode %{
1055     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1056   %}
1057   ins_pipe(pipe_slow);
1058 %}
1059 
1060 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1061   predicate(UseAVX > 0);
1062   match(Set dst (AddF src con));
1063 
1064   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1065   ins_cost(150);
1066   ins_encode %{
1067     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1068   %}
1069   ins_pipe(pipe_slow);
1070 %}
1071 
1072 instruct addD_reg(regD dst, regD src) %{
1073   predicate((UseSSE>=2) && (UseAVX == 0));
1074   match(Set dst (AddD dst src));
1075 
1076   format %{ "addsd   $dst, $src" %}
1077   ins_cost(150);
1078   ins_encode %{
1079     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1080   %}
1081   ins_pipe(pipe_slow);
1082 %}
1083 
1084 instruct addD_mem(regD dst, memory src) %{
1085   predicate((UseSSE>=2) && (UseAVX == 0));
1086   match(Set dst (AddD dst (LoadD src)));
1087 
1088   format %{ "addsd   $dst, $src" %}
1089   ins_cost(150);
1090   ins_encode %{
1091     __ addsd($dst$$XMMRegister, $src$$Address);
1092   %}
1093   ins_pipe(pipe_slow);
1094 %}
1095 
1096 instruct addD_imm(regD dst, immD con) %{
1097   predicate((UseSSE>=2) && (UseAVX == 0));
1098   match(Set dst (AddD dst con));
1099   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1100   ins_cost(150);
1101   ins_encode %{
1102     __ addsd($dst$$XMMRegister, $constantaddress($con));
1103   %}
1104   ins_pipe(pipe_slow);
1105 %}
1106 
1107 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1108   predicate(UseAVX > 0);
1109   match(Set dst (AddD src1 src2));
1110 
1111   format %{ "vaddsd  $dst, $src1, $src2" %}
1112   ins_cost(150);
1113   ins_encode %{
1114     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1115   %}
1116   ins_pipe(pipe_slow);
1117 %}
1118 
1119 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1120   predicate(UseAVX > 0);
1121   match(Set dst (AddD src1 (LoadD src2)));
1122 
1123   format %{ "vaddsd  $dst, $src1, $src2" %}
1124   ins_cost(150);
1125   ins_encode %{
1126     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1127   %}
1128   ins_pipe(pipe_slow);
1129 %}
1130 
1131 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1132   predicate(UseAVX > 0);
1133   match(Set dst (AddD src con));
1134 
1135   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1136   ins_cost(150);
1137   ins_encode %{
1138     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1139   %}
1140   ins_pipe(pipe_slow);
1141 %}
1142 
1143 instruct subF_reg(regF dst, regF src) %{
1144   predicate((UseSSE>=1) && (UseAVX == 0));
1145   match(Set dst (SubF dst src));
1146 
1147   format %{ "subss   $dst, $src" %}
1148   ins_cost(150);
1149   ins_encode %{
1150     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1151   %}
1152   ins_pipe(pipe_slow);
1153 %}
1154 
1155 instruct subF_mem(regF dst, memory src) %{
1156   predicate((UseSSE>=1) && (UseAVX == 0));
1157   match(Set dst (SubF dst (LoadF src)));
1158 
1159   format %{ "subss   $dst, $src" %}
1160   ins_cost(150);
1161   ins_encode %{
1162     __ subss($dst$$XMMRegister, $src$$Address);
1163   %}
1164   ins_pipe(pipe_slow);
1165 %}
1166 
1167 instruct subF_imm(regF dst, immF con) %{
1168   predicate((UseSSE>=1) && (UseAVX == 0));
1169   match(Set dst (SubF dst con));
1170   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1171   ins_cost(150);
1172   ins_encode %{
1173     __ subss($dst$$XMMRegister, $constantaddress($con));
1174   %}
1175   ins_pipe(pipe_slow);
1176 %}
1177 
1178 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
1179   predicate(UseAVX > 0);
1180   match(Set dst (SubF src1 src2));
1181 
1182   format %{ "vsubss  $dst, $src1, $src2" %}
1183   ins_cost(150);
1184   ins_encode %{
1185     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1186   %}
1187   ins_pipe(pipe_slow);
1188 %}
1189 
1190 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
1191   predicate(UseAVX > 0);
1192   match(Set dst (SubF src1 (LoadF src2)));
1193 
1194   format %{ "vsubss  $dst, $src1, $src2" %}
1195   ins_cost(150);
1196   ins_encode %{
1197     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1198   %}
1199   ins_pipe(pipe_slow);
1200 %}
1201 
1202 instruct subF_reg_imm(regF dst, regF src, immF con) %{
1203   predicate(UseAVX > 0);
1204   match(Set dst (SubF src con));
1205 
1206   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1207   ins_cost(150);
1208   ins_encode %{
1209     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1210   %}
1211   ins_pipe(pipe_slow);
1212 %}
1213 
1214 instruct subD_reg(regD dst, regD src) %{
1215   predicate((UseSSE>=2) && (UseAVX == 0));
1216   match(Set dst (SubD dst src));
1217 
1218   format %{ "subsd   $dst, $src" %}
1219   ins_cost(150);
1220   ins_encode %{
1221     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
1222   %}
1223   ins_pipe(pipe_slow);
1224 %}
1225 
1226 instruct subD_mem(regD dst, memory src) %{
1227   predicate((UseSSE>=2) && (UseAVX == 0));
1228   match(Set dst (SubD dst (LoadD src)));
1229 
1230   format %{ "subsd   $dst, $src" %}
1231   ins_cost(150);
1232   ins_encode %{
1233     __ subsd($dst$$XMMRegister, $src$$Address);
1234   %}
1235   ins_pipe(pipe_slow);
1236 %}
1237 
1238 instruct subD_imm(regD dst, immD con) %{
1239   predicate((UseSSE>=2) && (UseAVX == 0));
1240   match(Set dst (SubD dst con));
1241   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1242   ins_cost(150);
1243   ins_encode %{
1244     __ subsd($dst$$XMMRegister, $constantaddress($con));
1245   %}
1246   ins_pipe(pipe_slow);
1247 %}
1248 
1249 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
1250   predicate(UseAVX > 0);
1251   match(Set dst (SubD src1 src2));
1252 
1253   format %{ "vsubsd  $dst, $src1, $src2" %}
1254   ins_cost(150);
1255   ins_encode %{
1256     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1257   %}
1258   ins_pipe(pipe_slow);
1259 %}
1260 
1261 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
1262   predicate(UseAVX > 0);
1263   match(Set dst (SubD src1 (LoadD src2)));
1264 
1265   format %{ "vsubsd  $dst, $src1, $src2" %}
1266   ins_cost(150);
1267   ins_encode %{
1268     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1269   %}
1270   ins_pipe(pipe_slow);
1271 %}
1272 
1273 instruct subD_reg_imm(regD dst, regD src, immD con) %{
1274   predicate(UseAVX > 0);
1275   match(Set dst (SubD src con));
1276 
1277   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1278   ins_cost(150);
1279   ins_encode %{
1280     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1281   %}
1282   ins_pipe(pipe_slow);
1283 %}
1284 
1285 instruct mulF_reg(regF dst, regF src) %{
1286   predicate((UseSSE>=1) && (UseAVX == 0));
1287   match(Set dst (MulF dst src));
1288 
1289   format %{ "mulss   $dst, $src" %}
1290   ins_cost(150);
1291   ins_encode %{
1292     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
1293   %}
1294   ins_pipe(pipe_slow);
1295 %}
1296 
1297 instruct mulF_mem(regF dst, memory src) %{
1298   predicate((UseSSE>=1) && (UseAVX == 0));
1299   match(Set dst (MulF dst (LoadF src)));
1300 
1301   format %{ "mulss   $dst, $src" %}
1302   ins_cost(150);
1303   ins_encode %{
1304     __ mulss($dst$$XMMRegister, $src$$Address);
1305   %}
1306   ins_pipe(pipe_slow);
1307 %}
1308 
1309 instruct mulF_imm(regF dst, immF con) %{
1310   predicate((UseSSE>=1) && (UseAVX == 0));
1311   match(Set dst (MulF dst con));
1312   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1313   ins_cost(150);
1314   ins_encode %{
1315     __ mulss($dst$$XMMRegister, $constantaddress($con));
1316   %}
1317   ins_pipe(pipe_slow);
1318 %}
1319 
1320 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
1321   predicate(UseAVX > 0);
1322   match(Set dst (MulF src1 src2));
1323 
1324   format %{ "vmulss  $dst, $src1, $src2" %}
1325   ins_cost(150);
1326   ins_encode %{
1327     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1328   %}
1329   ins_pipe(pipe_slow);
1330 %}
1331 
1332 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
1333   predicate(UseAVX > 0);
1334   match(Set dst (MulF src1 (LoadF src2)));
1335 
1336   format %{ "vmulss  $dst, $src1, $src2" %}
1337   ins_cost(150);
1338   ins_encode %{
1339     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1340   %}
1341   ins_pipe(pipe_slow);
1342 %}
1343 
1344 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
1345   predicate(UseAVX > 0);
1346   match(Set dst (MulF src con));
1347 
1348   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1349   ins_cost(150);
1350   ins_encode %{
1351     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1352   %}
1353   ins_pipe(pipe_slow);
1354 %}
1355 
1356 instruct mulD_reg(regD dst, regD src) %{
1357   predicate((UseSSE>=2) && (UseAVX == 0));
1358   match(Set dst (MulD dst src));
1359 
1360   format %{ "mulsd   $dst, $src" %}
1361   ins_cost(150);
1362   ins_encode %{
1363     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
1364   %}
1365   ins_pipe(pipe_slow);
1366 %}
1367 
1368 instruct mulD_mem(regD dst, memory src) %{
1369   predicate((UseSSE>=2) && (UseAVX == 0));
1370   match(Set dst (MulD dst (LoadD src)));
1371 
1372   format %{ "mulsd   $dst, $src" %}
1373   ins_cost(150);
1374   ins_encode %{
1375     __ mulsd($dst$$XMMRegister, $src$$Address);
1376   %}
1377   ins_pipe(pipe_slow);
1378 %}
1379 
1380 instruct mulD_imm(regD dst, immD con) %{
1381   predicate((UseSSE>=2) && (UseAVX == 0));
1382   match(Set dst (MulD dst con));
1383   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1384   ins_cost(150);
1385   ins_encode %{
1386     __ mulsd($dst$$XMMRegister, $constantaddress($con));
1387   %}
1388   ins_pipe(pipe_slow);
1389 %}
1390 
1391 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
1392   predicate(UseAVX > 0);
1393   match(Set dst (MulD src1 src2));
1394 
1395   format %{ "vmulsd  $dst, $src1, $src2" %}
1396   ins_cost(150);
1397   ins_encode %{
1398     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1399   %}
1400   ins_pipe(pipe_slow);
1401 %}
1402 
1403 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
1404   predicate(UseAVX > 0);
1405   match(Set dst (MulD src1 (LoadD src2)));
1406 
1407   format %{ "vmulsd  $dst, $src1, $src2" %}
1408   ins_cost(150);
1409   ins_encode %{
1410     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1411   %}
1412   ins_pipe(pipe_slow);
1413 %}
1414 
1415 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
1416   predicate(UseAVX > 0);
1417   match(Set dst (MulD src con));
1418 
1419   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1420   ins_cost(150);
1421   ins_encode %{
1422     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1423   %}
1424   ins_pipe(pipe_slow);
1425 %}
1426 
1427 instruct divF_reg(regF dst, regF src) %{
1428   predicate((UseSSE>=1) && (UseAVX == 0));
1429   match(Set dst (DivF dst src));
1430 
1431   format %{ "divss   $dst, $src" %}
1432   ins_cost(150);
1433   ins_encode %{
1434     __ divss($dst$$XMMRegister, $src$$XMMRegister);
1435   %}
1436   ins_pipe(pipe_slow);
1437 %}
1438 
1439 instruct divF_mem(regF dst, memory src) %{
1440   predicate((UseSSE>=1) && (UseAVX == 0));
1441   match(Set dst (DivF dst (LoadF src)));
1442 
1443   format %{ "divss   $dst, $src" %}
1444   ins_cost(150);
1445   ins_encode %{
1446     __ divss($dst$$XMMRegister, $src$$Address);
1447   %}
1448   ins_pipe(pipe_slow);
1449 %}
1450 
1451 instruct divF_imm(regF dst, immF con) %{
1452   predicate((UseSSE>=1) && (UseAVX == 0));
1453   match(Set dst (DivF dst con));
1454   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1455   ins_cost(150);
1456   ins_encode %{
1457     __ divss($dst$$XMMRegister, $constantaddress($con));
1458   %}
1459   ins_pipe(pipe_slow);
1460 %}
1461 
1462 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
1463   predicate(UseAVX > 0);
1464   match(Set dst (DivF src1 src2));
1465 
1466   format %{ "vdivss  $dst, $src1, $src2" %}
1467   ins_cost(150);
1468   ins_encode %{
1469     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1470   %}
1471   ins_pipe(pipe_slow);
1472 %}
1473 
1474 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
1475   predicate(UseAVX > 0);
1476   match(Set dst (DivF src1 (LoadF src2)));
1477 
1478   format %{ "vdivss  $dst, $src1, $src2" %}
1479   ins_cost(150);
1480   ins_encode %{
1481     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1482   %}
1483   ins_pipe(pipe_slow);
1484 %}
1485 
1486 instruct divF_reg_imm(regF dst, regF src, immF con) %{
1487   predicate(UseAVX > 0);
1488   match(Set dst (DivF src con));
1489 
1490   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1491   ins_cost(150);
1492   ins_encode %{
1493     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1494   %}
1495   ins_pipe(pipe_slow);
1496 %}
1497 
1498 instruct divD_reg(regD dst, regD src) %{
1499   predicate((UseSSE>=2) && (UseAVX == 0));
1500   match(Set dst (DivD dst src));
1501 
1502   format %{ "divsd   $dst, $src" %}
1503   ins_cost(150);
1504   ins_encode %{
1505     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
1506   %}
1507   ins_pipe(pipe_slow);
1508 %}
1509 
1510 instruct divD_mem(regD dst, memory src) %{
1511   predicate((UseSSE>=2) && (UseAVX == 0));
1512   match(Set dst (DivD dst (LoadD src)));
1513 
1514   format %{ "divsd   $dst, $src" %}
1515   ins_cost(150);
1516   ins_encode %{
1517     __ divsd($dst$$XMMRegister, $src$$Address);
1518   %}
1519   ins_pipe(pipe_slow);
1520 %}
1521 
1522 instruct divD_imm(regD dst, immD con) %{
1523   predicate((UseSSE>=2) && (UseAVX == 0));
1524   match(Set dst (DivD dst con));
1525   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1526   ins_cost(150);
1527   ins_encode %{
1528     __ divsd($dst$$XMMRegister, $constantaddress($con));
1529   %}
1530   ins_pipe(pipe_slow);
1531 %}
1532 
1533 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
1534   predicate(UseAVX > 0);
1535   match(Set dst (DivD src1 src2));
1536 
1537   format %{ "vdivsd  $dst, $src1, $src2" %}
1538   ins_cost(150);
1539   ins_encode %{
1540     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1541   %}
1542   ins_pipe(pipe_slow);
1543 %}
1544 
1545 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
1546   predicate(UseAVX > 0);
1547   match(Set dst (DivD src1 (LoadD src2)));
1548 
1549   format %{ "vdivsd  $dst, $src1, $src2" %}
1550   ins_cost(150);
1551   ins_encode %{
1552     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1553   %}
1554   ins_pipe(pipe_slow);
1555 %}
1556 
1557 instruct divD_reg_imm(regD dst, regD src, immD con) %{
1558   predicate(UseAVX > 0);
1559   match(Set dst (DivD src con));
1560 
1561   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1562   ins_cost(150);
1563   ins_encode %{
1564     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1565   %}
1566   ins_pipe(pipe_slow);
1567 %}
1568 
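     // The abs/neg rules below are bit operations on the IEEE sign bit: abs masks
     // it off (and with 0x7fffffff / 0x7fffffffffffffff), neg flips it (xor with
     // 0x80000000 / 0x8000000000000000).  The masks are in-memory constants reached
     // through ExternalAddress(float_signmask()) and friends.  The AVX variants pass
     // vector256 = false because only the scalar low element matters, so the
     // 128-bit VEX encoding is enough.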
1569 instruct absF_reg(regF dst) %{
1570   predicate((UseSSE>=1) && (UseAVX == 0));
1571   match(Set dst (AbsF dst));
1572   ins_cost(150);
1573   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
1574   ins_encode %{
1575     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
1576   %}
1577   ins_pipe(pipe_slow);
1578 %}
1579 
1580 instruct absF_reg_reg(regF dst, regF src) %{
1581   predicate(UseAVX > 0);
1582   match(Set dst (AbsF src));
1583   ins_cost(150);
1584   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
1585   ins_encode %{
1586     bool vector256 = false;
1587     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
1588               ExternalAddress(float_signmask()), vector256);
1589   %}
1590   ins_pipe(pipe_slow);
1591 %}
1592 
1593 instruct absD_reg(regD dst) %{
1594   predicate((UseSSE>=2) && (UseAVX == 0));
1595   match(Set dst (AbsD dst));
1596   ins_cost(150);
1597   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
1598             "# abs double by sign masking" %}
1599   ins_encode %{
1600     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
1601   %}
1602   ins_pipe(pipe_slow);
1603 %}
1604 
1605 instruct absD_reg_reg(regD dst, regD src) %{
1606   predicate(UseAVX > 0);
1607   match(Set dst (AbsD src));
1608   ins_cost(150);
1609   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
1610             "# abs double by sign masking" %}
1611   ins_encode %{
1612     bool vector256 = false;
1613     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
1614               ExternalAddress(double_signmask()), vector256);
1615   %}
1616   ins_pipe(pipe_slow);
1617 %}
1618 
1619 instruct negF_reg(regF dst) %{
1620   predicate((UseSSE>=1) && (UseAVX == 0));
1621   match(Set dst (NegF dst));
1622   ins_cost(150);
1623   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
1624   ins_encode %{
1625     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
1626   %}
1627   ins_pipe(pipe_slow);
1628 %}
1629 
1630 instruct negF_reg_reg(regF dst, regF src) %{
1631   predicate(UseAVX > 0);
1632   match(Set dst (NegF src));
1633   ins_cost(150);
1634   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
1635   ins_encode %{
1636     bool vector256 = false;
1637     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
1638               ExternalAddress(float_signflip()), vector256);
1639   %}
1640   ins_pipe(pipe_slow);
1641 %}
1642 
1643 instruct negD_reg(regD dst) %{
1644   predicate((UseSSE>=2) && (UseAVX == 0));
1645   match(Set dst (NegD dst));
1646   ins_cost(150);
1647   format %{ "xorpd   $dst, [0x8000000000000000]\t"
1648             "# neg double by sign flipping" %}
1649   ins_encode %{
1650     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
1651   %}
1652   ins_pipe(pipe_slow);
1653 %}
1654 
1655 instruct negD_reg_reg(regD dst, regD src) %{
1656   predicate(UseAVX > 0);
1657   match(Set dst (NegD src));
1658   ins_cost(150);
1659   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
1660             "# neg double by sign flipping" %}
1661   ins_encode %{
1662     bool vector256 = false;
1663     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
1664               ExternalAddress(double_signflip()), vector256);
1665   %}
1666   ins_pipe(pipe_slow);
1667 %}
1668 
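     // A float square root reaches the matcher as the round trip
     // ConvD2F(SqrtD(ConvF2D src)) since the ideal graph only has a double sqrt
     // node.  The rules below collapse that pattern into a single sqrtss: double
     // precision is wide enough that computing the sqrt in double and rounding
     // back to float gives the same result as a direct single-precision sqrt.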
1669 instruct sqrtF_reg(regF dst, regF src) %{
1670   predicate(UseSSE>=1);
1671   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
1672 
1673   format %{ "sqrtss  $dst, $src" %}
1674   ins_cost(150);
1675   ins_encode %{
1676     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
1677   %}
1678   ins_pipe(pipe_slow);
1679 %}
1680 
1681 instruct sqrtF_mem(regF dst, memory src) %{
1682   predicate(UseSSE>=1);
1683   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
1684 
1685   format %{ "sqrtss  $dst, $src" %}
1686   ins_cost(150);
1687   ins_encode %{
1688     __ sqrtss($dst$$XMMRegister, $src$$Address);
1689   %}
1690   ins_pipe(pipe_slow);
1691 %}
1692 
1693 instruct sqrtF_imm(regF dst, immF con) %{
1694   predicate(UseSSE>=1);
1695   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
1696   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1697   ins_cost(150);
1698   ins_encode %{
1699     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
1700   %}
1701   ins_pipe(pipe_slow);
1702 %}
1703 
1704 instruct sqrtD_reg(regD dst, regD src) %{
1705   predicate(UseSSE>=2);
1706   match(Set dst (SqrtD src));
1707 
1708   format %{ "sqrtsd  $dst, $src" %}
1709   ins_cost(150);
1710   ins_encode %{
1711     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
1712   %}
1713   ins_pipe(pipe_slow);
1714 %}
1715 
1716 instruct sqrtD_mem(regD dst, memory src) %{
1717   predicate(UseSSE>=2);
1718   match(Set dst (SqrtD (LoadD src)));
1719 
1720   format %{ "sqrtsd  $dst, $src" %}
1721   ins_cost(150);
1722   ins_encode %{
1723     __ sqrtsd($dst$$XMMRegister, $src$$Address);
1724   %}
1725   ins_pipe(pipe_slow);
1726 %}
1727 
1728 instruct sqrtD_imm(regD dst, immD con) %{
1729   predicate(UseSSE>=2);
1730   match(Set dst (SqrtD con));
1731   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1732   ins_cost(150);
1733   ins_encode %{
1734     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
1735   %}
1736   ins_pipe(pipe_slow);
1737 %}
1738 
1739 
1740 // ====================VECTOR INSTRUCTIONS=====================================
1741 
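     // The vector load/store rules below are selected by the in-memory size of
     // the vector and use the matching register class and move instruction:
     //   vecS   4 bytes  movd
     //   vecD   8 bytes  movq
     //   vecX  16 bytes  movdqu
     //   vecY  32 bytes  vmovdqu (VEX-encoded, 256-bit, requires AVX)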
1742 // Load vectors (4 bytes long)
1743 instruct loadV4(vecS dst, memory mem) %{
1744   predicate(n->as_LoadVector()->memory_size() == 4);
1745   match(Set dst (LoadVector mem));
1746   ins_cost(125);
1747   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
1748   ins_encode %{
1749     __ movdl($dst$$XMMRegister, $mem$$Address);
1750   %}
1751   ins_pipe( pipe_slow );
1752 %}
1753 
1754 // Load vectors (8 bytes long)
1755 instruct loadV8(vecD dst, memory mem) %{
1756   predicate(n->as_LoadVector()->memory_size() == 8);
1757   match(Set dst (LoadVector mem));
1758   ins_cost(125);
1759   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
1760   ins_encode %{
1761     __ movq($dst$$XMMRegister, $mem$$Address);
1762   %}
1763   ins_pipe( pipe_slow );
1764 %}
1765 
1766 // Load vectors (16 bytes long)
1767 instruct loadV16(vecX dst, memory mem) %{
1768   predicate(n->as_LoadVector()->memory_size() == 16);
1769   match(Set dst (LoadVector mem));
1770   ins_cost(125);
1771   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
1772   ins_encode %{
1773     __ movdqu($dst$$XMMRegister, $mem$$Address);
1774   %}
1775   ins_pipe( pipe_slow );
1776 %}
1777 
1778 // Load vectors (32 bytes long)
1779 instruct loadV32(vecY dst, memory mem) %{
1780   predicate(n->as_LoadVector()->memory_size() == 32);
1781   match(Set dst (LoadVector mem));
1782   ins_cost(125);
1783   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
1784   ins_encode %{
1785     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
1786   %}
1787   ins_pipe( pipe_slow );
1788 %}
1789 
1790 // Store vectors
1791 instruct storeV4(memory mem, vecS src) %{
1792   predicate(n->as_StoreVector()->memory_size() == 4);
1793   match(Set mem (StoreVector mem src));
1794   ins_cost(145);
1795   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
1796   ins_encode %{
1797     __ movdl($mem$$Address, $src$$XMMRegister);
1798   %}
1799   ins_pipe( pipe_slow );
1800 %}
1801 
1802 instruct storeV8(memory mem, vecD src) %{
1803   predicate(n->as_StoreVector()->memory_size() == 8);
1804   match(Set mem (StoreVector mem src));
1805   ins_cost(145);
1806   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
1807   ins_encode %{
1808     __ movq($mem$$Address, $src$$XMMRegister);
1809   %}
1810   ins_pipe( pipe_slow );
1811 %}
1812 
1813 instruct storeV16(memory mem, vecX src) %{
1814   predicate(n->as_StoreVector()->memory_size() == 16);
1815   match(Set mem (StoreVector mem src));
1816   ins_cost(145);
1817   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
1818   ins_encode %{
1819     __ movdqu($mem$$Address, $src$$XMMRegister);
1820   %}
1821   ins_pipe( pipe_slow );
1822 %}
1823 
1824 instruct storeV32(memory mem, vecY src) %{
1825   predicate(n->as_StoreVector()->memory_size() == 32);
1826   match(Set mem (StoreVector mem src));
1827   ins_cost(145);
1828   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
1829   ins_encode %{
1830     __ vmovdqu($mem$$Address, $src$$XMMRegister);
1831   %}
1832   ins_pipe( pipe_slow );
1833 %}
1834 
1835 // Replicate byte scalar into a vector
1836 instruct Repl4B(vecS dst, rRegI src) %{
1837   predicate(n->as_Vector()->length() == 4);
1838   match(Set dst (ReplicateB src));
1839   format %{ "movd    $dst,$src\n\t"
1840             "punpcklbw $dst,$dst\n\t"
1841             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
1842   ins_encode %{
1843     __ movdl($dst$$XMMRegister, $src$$Register);
1844     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1845     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1846   %}
1847   ins_pipe( pipe_slow );
1848 %}
1849 
1850 instruct Repl8B(vecD dst, rRegI src) %{
1851   predicate(n->as_Vector()->length() == 8);
1852   match(Set dst (ReplicateB src));
1853   format %{ "movd    $dst,$src\n\t"
1854             "punpcklbw $dst,$dst\n\t"
1855             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
1856   ins_encode %{
1857     __ movdl($dst$$XMMRegister, $src$$Register);
1858     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1859     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1860   %}
1861   ins_pipe( pipe_slow );
1862 %}
1863 
1864 instruct Repl16B(vecX dst, rRegI src) %{
1865   predicate(n->as_Vector()->length() == 16);
1866   match(Set dst (ReplicateB src));
1867   format %{ "movd    $dst,$src\n\t"
1868             "punpcklbw $dst,$dst\n\t"
1869             "pshuflw $dst,$dst,0x00\n\t"
1870             "punpcklqdq $dst,$dst\t! replicate16B" %}
1871   ins_encode %{
1872     __ movdl($dst$$XMMRegister, $src$$Register);
1873     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1874     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1875     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1876   %}
1877   ins_pipe( pipe_slow );
1878 %}
1879 
1880 instruct Repl32B(vecY dst, rRegI src) %{
1881   predicate(n->as_Vector()->length() == 32);
1882   match(Set dst (ReplicateB src));
1883   format %{ "movd    $dst,$src\n\t"
1884             "punpcklbw $dst,$dst\n\t"
1885             "pshuflw $dst,$dst,0x00\n\t"
1886             "punpcklqdq $dst,$dst\n\t"
1887             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
1888   ins_encode %{
1889     __ movdl($dst$$XMMRegister, $src$$Register);
1890     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1891     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1892     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1893     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1894   %}
1895   ins_pipe( pipe_slow );
1896 %}
1897 
1898 // Replicate byte scalar immediate into a vector by loading it from the constant table.
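     // The replicate4_imm/replicate8_imm helpers used here appear to build a 4- or
     // 8-byte constant by repeating the immediate at the element size passed as the
     // second argument (1 = byte, 2 = short, 4 = int), e.g. replicate8_imm(0x41, 1)
     // should give 0x4141414141414141.  That constant is loaded with movdl/movq and,
     // for the wider vectors, spread with punpcklqdq and vinserti128h.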
1899 instruct Repl4B_imm(vecS dst, immI con) %{
1900   predicate(n->as_Vector()->length() == 4);
1901   match(Set dst (ReplicateB con));
1902   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
1903   ins_encode %{
1904     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
1905   %}
1906   ins_pipe( pipe_slow );
1907 %}
1908 
1909 instruct Repl8B_imm(vecD dst, immI con) %{
1910   predicate(n->as_Vector()->length() == 8);
1911   match(Set dst (ReplicateB con));
1912   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
1913   ins_encode %{
1914     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1915   %}
1916   ins_pipe( pipe_slow );
1917 %}
1918 
1919 instruct Repl16B_imm(vecX dst, immI con) %{
1920   predicate(n->as_Vector()->length() == 16);
1921   match(Set dst (ReplicateB con));
1922   format %{ "movq    $dst,[$constantaddress]\n\t"
1923             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
1924   ins_encode %{
1925     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1926     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1927   %}
1928   ins_pipe( pipe_slow );
1929 %}
1930 
1931 instruct Repl32B_imm(vecY dst, immI con) %{
1932   predicate(n->as_Vector()->length() == 32);
1933   match(Set dst (ReplicateB con));
1934   format %{ "movq    $dst,[$constantaddress]\n\t"
1935             "punpcklqdq $dst,$dst\n\t"
1936             "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
1937   ins_encode %{
1938     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1939     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1940     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1941   %}
1942   ins_pipe( pipe_slow );
1943 %}
1944 
1945 // Replicate byte scalar zero into a vector
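     // Zeroing relies on the xor-with-self idiom, so the element size does not
     // matter and a single pxor (vpxor for 256-bit) serves every lane width.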
1946 instruct Repl4B_zero(vecS dst, immI0 zero) %{
1947   predicate(n->as_Vector()->length() == 4);
1948   match(Set dst (ReplicateB zero));
1949   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
1950   ins_encode %{
1951     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1952   %}
1953   ins_pipe( fpu_reg_reg );
1954 %}
1955 
1956 instruct Repl8B_zero(vecD dst, immI0 zero) %{
1957   predicate(n->as_Vector()->length() == 8);
1958   match(Set dst (ReplicateB zero));
1959   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
1960   ins_encode %{
1961     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1962   %}
1963   ins_pipe( fpu_reg_reg );
1964 %}
1965 
1966 instruct Repl16B_zero(vecX dst, immI0 zero) %{
1967   predicate(n->as_Vector()->length() == 16);
1968   match(Set dst (ReplicateB zero));
1969   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
1970   ins_encode %{
1971     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1972   %}
1973   ins_pipe( fpu_reg_reg );
1974 %}
1975 
1976 instruct Repl32B_zero(vecY dst, immI0 zero) %{
1977   predicate(n->as_Vector()->length() == 32);
1978   match(Set dst (ReplicateB zero));
1979   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
1980   ins_encode %{
1981     // 256-bit vpxor needs AVX2; the MacroAssembler vpxor() falls back to vxorpd on plain AVX.
1982     bool vector256 = true;
1983     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
1984   %}
1985   ins_pipe( fpu_reg_reg );
1986 %}
1987 
1988 // Replicate char/short (2 byte) scalar into a vector
1989 instruct Repl2S(vecS dst, rRegI src) %{
1990   predicate(n->as_Vector()->length() == 2);
1991   match(Set dst (ReplicateS src));
1992   format %{ "movd    $dst,$src\n\t"
1993             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
1994   ins_encode %{
1995     __ movdl($dst$$XMMRegister, $src$$Register);
1996     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1997   %}
1998   ins_pipe( fpu_reg_reg );
1999 %}
2000 
2001 instruct Repl4S(vecD dst, rRegI src) %{
2002   predicate(n->as_Vector()->length() == 4);
2003   match(Set dst (ReplicateS src));
2004   format %{ "movd    $dst,$src\n\t"
2005             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
2006   ins_encode %{
2007     __ movdl($dst$$XMMRegister, $src$$Register);
2008     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2009   %}
2010   ins_pipe( fpu_reg_reg );
2011 %}
2012 
2013 instruct Repl8S(vecX dst, rRegI src) %{
2014   predicate(n->as_Vector()->length() == 8);
2015   match(Set dst (ReplicateS src));
2016   format %{ "movd    $dst,$src\n\t"
2017             "pshuflw $dst,$dst,0x00\n\t"
2018             "punpcklqdq $dst,$dst\t! replicate8S" %}
2019   ins_encode %{
2020     __ movdl($dst$$XMMRegister, $src$$Register);
2021     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2022     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2023   %}
2024   ins_pipe( pipe_slow );
2025 %}
2026 
2027 instruct Repl16S(vecY dst, rRegI src) %{
2028   predicate(n->as_Vector()->length() == 16);
2029   match(Set dst (ReplicateS src));
2030   format %{ "movd    $dst,$src\n\t"
2031             "pshuflw $dst,$dst,0x00\n\t"
2032             "punpcklqdq $dst,$dst\n\t"
2033             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
2034   ins_encode %{
2035     __ movdl($dst$$XMMRegister, $src$$Register);
2036     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2037     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2038     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2039   %}
2040   ins_pipe( pipe_slow );
2041 %}
2042 
2043 // Replicate char/short (2 byte) scalar immediate into a vector by loading it from the constant table.
2044 instruct Repl2S_imm(vecS dst, immI con) %{
2045   predicate(n->as_Vector()->length() == 2);
2046   match(Set dst (ReplicateS con));
2047   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
2048   ins_encode %{
2049     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
2050   %}
2051   ins_pipe( fpu_reg_reg );
2052 %}
2053 
2054 instruct Repl4S_imm(vecD dst, immI con) %{
2055   predicate(n->as_Vector()->length() == 4);
2056   match(Set dst (ReplicateS con));
2057   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
2058   ins_encode %{
2059     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2060   %}
2061   ins_pipe( fpu_reg_reg );
2062 %}
2063 
2064 instruct Repl8S_imm(vecX dst, immI con) %{
2065   predicate(n->as_Vector()->length() == 8);
2066   match(Set dst (ReplicateS con));
2067   format %{ "movq    $dst,[$constantaddress]\n\t"
2068             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
2069   ins_encode %{
2070     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2071     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2072   %}
2073   ins_pipe( pipe_slow );
2074 %}
2075 
2076 instruct Repl16S_imm(vecY dst, immI con) %{
2077   predicate(n->as_Vector()->length() == 16);
2078   match(Set dst (ReplicateS con));
2079   format %{ "movq    $dst,[$constantaddress]\n\t"
2080             "punpcklqdq $dst,$dst\n\t"
2081             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
2082   ins_encode %{
2083     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2084     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2085     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2086   %}
2087   ins_pipe( pipe_slow );
2088 %}
2089 
2090 // Replicate char/short (2 byte) scalar zero into a vector
2091 instruct Repl2S_zero(vecS dst, immI0 zero) %{
2092   predicate(n->as_Vector()->length() == 2);
2093   match(Set dst (ReplicateS zero));
2094   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
2095   ins_encode %{
2096     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2097   %}
2098   ins_pipe( fpu_reg_reg );
2099 %}
2100 
2101 instruct Repl4S_zero(vecD dst, immI0 zero) %{
2102   predicate(n->as_Vector()->length() == 4);
2103   match(Set dst (ReplicateS zero));
2104   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
2105   ins_encode %{
2106     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2107   %}
2108   ins_pipe( fpu_reg_reg );
2109 %}
2110 
2111 instruct Repl8S_zero(vecX dst, immI0 zero) %{
2112   predicate(n->as_Vector()->length() == 8);
2113   match(Set dst (ReplicateS zero));
2114   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
2115   ins_encode %{
2116     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2117   %}
2118   ins_pipe( fpu_reg_reg );
2119 %}
2120 
2121 instruct Repl16S_zero(vecY dst, immI0 zero) %{
2122   predicate(n->as_Vector()->length() == 16);
2123   match(Set dst (ReplicateS zero));
2124   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
2125   ins_encode %{
2126     // 256-bit vpxor needs AVX2; the MacroAssembler vpxor() falls back to vxorpd on plain AVX.
2127     bool vector256 = true;
2128     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2129   %}
2130   ins_pipe( fpu_reg_reg );
2131 %}
2132 
2133 // Replicate integer (4 byte) scalar into a vector
2134 instruct Repl2I(vecD dst, rRegI src) %{
2135   predicate(n->as_Vector()->length() == 2);
2136   match(Set dst (ReplicateI src));
2137   format %{ "movd    $dst,$src\n\t"
2138             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2139   ins_encode %{
2140     __ movdl($dst$$XMMRegister, $src$$Register);
2141     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2142   %}
2143   ins_pipe( fpu_reg_reg );
2144 %}
2145 
2146 instruct Repl4I(vecX dst, rRegI src) %{
2147   predicate(n->as_Vector()->length() == 4);
2148   match(Set dst (ReplicateI src));
2149   format %{ "movd    $dst,$src\n\t"
2150             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2151   ins_encode %{
2152     __ movdl($dst$$XMMRegister, $src$$Register);
2153     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2154   %}
2155   ins_pipe( pipe_slow );
2156 %}
2157 
2158 instruct Repl8I(vecY dst, rRegI src) %{
2159   predicate(n->as_Vector()->length() == 8);
2160   match(Set dst (ReplicateI src));
2161   format %{ "movd    $dst,$src\n\t"
2162             "pshufd  $dst,$dst,0x00\n\t"
2163             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2164   ins_encode %{
2165     __ movdl($dst$$XMMRegister, $src$$Register);
2166     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2167     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2168   %}
2169   ins_pipe( pipe_slow );
2170 %}
2171 
2172 // Replicate integer (4 byte) scalar immediate into a vector by loading it from the constant table.
2173 instruct Repl2I_imm(vecD dst, immI con) %{
2174   predicate(n->as_Vector()->length() == 2);
2175   match(Set dst (ReplicateI con));
2176   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
2177   ins_encode %{
2178     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2179   %}
2180   ins_pipe( fpu_reg_reg );
2181 %}
2182 
2183 instruct Repl4I_imm(vecX dst, immI con) %{
2184   predicate(n->as_Vector()->length() == 4);
2185   match(Set dst (ReplicateI con));
2186   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
2187             "punpcklqdq $dst,$dst" %}
2188   ins_encode %{
2189     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2190     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2191   %}
2192   ins_pipe( pipe_slow );
2193 %}
2194 
2195 instruct Repl8I_imm(vecY dst, immI con) %{
2196   predicate(n->as_Vector()->length() == 8);
2197   match(Set dst (ReplicateI con));
2198   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
2199             "punpcklqdq $dst,$dst\n\t"
2200             "vinserti128h $dst,$dst,$dst" %}
2201   ins_encode %{
2202     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2203     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2204     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2205   %}
2206   ins_pipe( pipe_slow );
2207 %}
2208 
2209 // An integer can be loaded into an XMM register directly from memory.
2210 instruct Repl2I_mem(vecD dst, memory mem) %{
2211   predicate(n->as_Vector()->length() == 2);
2212   match(Set dst (ReplicateI (LoadI mem)));
2213   format %{ "movd    $dst,$mem\n\t"
2214             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2215   ins_encode %{
2216     __ movdl($dst$$XMMRegister, $mem$$Address);
2217     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2218   %}
2219   ins_pipe( fpu_reg_reg );
2220 %}
2221 
2222 instruct Repl4I_mem(vecX dst, memory mem) %{
2223   predicate(n->as_Vector()->length() == 4);
2224   match(Set dst (ReplicateI (LoadI mem)));
2225   format %{ "movd    $dst,$mem\n\t"
2226             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2227   ins_encode %{
2228     __ movdl($dst$$XMMRegister, $mem$$Address);
2229     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2230   %}
2231   ins_pipe( pipe_slow );
2232 %}
2233 
2234 instruct Repl8I_mem(vecY dst, memory mem) %{
2235   predicate(n->as_Vector()->length() == 8);
2236   match(Set dst (ReplicateI (LoadI mem)));
2237   format %{ "movd    $dst,$mem\n\t"
2238             "pshufd  $dst,$dst,0x00\n\t"
2239             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2240   ins_encode %{
2241     __ movdl($dst$$XMMRegister, $mem$$Address);
2242     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2243     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2244   %}
2245   ins_pipe( pipe_slow );
2246 %}
2247 
2248 // Replicate integer (4 byte) scalar zero into a vector
2249 instruct Repl2I_zero(vecD dst, immI0 zero) %{
2250   predicate(n->as_Vector()->length() == 2);
2251   match(Set dst (ReplicateI zero));
2252   format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
2253   ins_encode %{
2254     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2255   %}
2256   ins_pipe( fpu_reg_reg );
2257 %}
2258 
2259 instruct Repl4I_zero(vecX dst, immI0 zero) %{
2260   predicate(n->as_Vector()->length() == 4);
2261   match(Set dst (ReplicateI zero));
2262   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
2263   ins_encode %{
2264     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2265   %}
2266   ins_pipe( fpu_reg_reg );
2267 %}
2268 
2269 instruct Repl8I_zero(vecY dst, immI0 zero) %{
2270   predicate(n->as_Vector()->length() == 8);
2271   match(Set dst (ReplicateI zero));
2272   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
2273   ins_encode %{
2274     // 256-bit vpxor needs AVX2; the MacroAssembler vpxor() falls back to vxorpd on plain AVX.
2275     bool vector256 = true;
2276     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2277   %}
2278   ins_pipe( fpu_reg_reg );
2279 %}
2280 
2281 // Replicate long (8 byte) scalar into a vector
2282 #ifdef _LP64
2283 instruct Repl2L(vecX dst, rRegL src) %{
2284   predicate(n->as_Vector()->length() == 2);
2285   match(Set dst (ReplicateL src));
2286   format %{ "movdq   $dst,$src\n\t"
2287             "punpcklqdq $dst,$dst\t! replicate2L" %}
2288   ins_encode %{
2289     __ movdq($dst$$XMMRegister, $src$$Register);
2290     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2291   %}
2292   ins_pipe( pipe_slow );
2293 %}
2294 
2295 instruct Repl4L(vecY dst, rRegL src) %{
2296   predicate(n->as_Vector()->length() == 4);
2297   match(Set dst (ReplicateL src));
2298   format %{ "movdq   $dst,$src\n\t"
2299             "punpcklqdq $dst,$dst\n\t"
2300             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2301   ins_encode %{
2302     __ movdq($dst$$XMMRegister, $src$$Register);
2303     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2304     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2305   %}
2306   ins_pipe( pipe_slow );
2307 %}
2308 #else // _LP64
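     // On 32-bit a long lives in a register pair: the low and high halves are moved
     // into XMM registers separately (HIGH_FROM_LOW selects the paired high register)
     // and merged with punpckldq before the quadword is broadcast as above.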
2309 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
2310   predicate(n->as_Vector()->length() == 2);
2311   match(Set dst (ReplicateL src));
2312   effect(TEMP dst, USE src, TEMP tmp);
2313   format %{ "movdl   $dst,$src.lo\n\t"
2314             "movdl   $tmp,$src.hi\n\t"
2315             "punpckldq $dst,$tmp\n\t"
2316             "punpcklqdq $dst,$dst\t! replicate2L"%}
2317   ins_encode %{
2318     __ movdl($dst$$XMMRegister, $src$$Register);
2319     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2320     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2321     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2322   %}
2323   ins_pipe( pipe_slow );
2324 %}
2325 
2326 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
2327   predicate(n->as_Vector()->length() == 4);
2328   match(Set dst (ReplicateL src));
2329   effect(TEMP dst, USE src, TEMP tmp);
2330   format %{ "movdl   $dst,$src.lo\n\t"
2331             "movdl   $tmp,$src.hi\n\t"
2332             "punpckldq $dst,$tmp\n\t"
2333             "punpcklqdq $dst,$dst\n\t"
2334             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2335   ins_encode %{
2336     __ movdl($dst$$XMMRegister, $src$$Register);
2337     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2338     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2339     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2340     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2341   %}
2342   ins_pipe( pipe_slow );
2343 %}
2344 #endif // _LP64
2345 
2346 // Replicate long (8 byte) scalar immediate into a vector by loading it from the constant table.
2347 instruct Repl2L_imm(vecX dst, immL con) %{
2348   predicate(n->as_Vector()->length() == 2);
2349   match(Set dst (ReplicateL con));
2350   format %{ "movq    $dst,[$constantaddress]\n\t"
2351             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
2352   ins_encode %{
2353     __ movq($dst$$XMMRegister, $constantaddress($con));
2354     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2355   %}
2356   ins_pipe( pipe_slow );
2357 %}
2358 
2359 instruct Repl4L_imm(vecY dst, immL con) %{
2360   predicate(n->as_Vector()->length() == 4);
2361   match(Set dst (ReplicateL con));
2362   format %{ "movq    $dst,[$constantaddress]\n\t"
2363             "punpcklqdq $dst,$dst\n\t"
2364             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
2365   ins_encode %{
2366     __ movq($dst$$XMMRegister, $constantaddress($con));
2367     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2368     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2369   %}
2370   ins_pipe( pipe_slow );
2371 %}
2372 
2373 // A long can be loaded into an XMM register directly from memory.
2374 instruct Repl2L_mem(vecX dst, memory mem) %{
2375   predicate(n->as_Vector()->length() == 2);
2376   match(Set dst (ReplicateL (LoadL mem)));
2377   format %{ "movq    $dst,$mem\n\t"
2378             "punpcklqdq $dst,$dst\t! replicate2L" %}
2379   ins_encode %{
2380     __ movq($dst$$XMMRegister, $mem$$Address);
2381     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2382   %}
2383   ins_pipe( pipe_slow );
2384 %}
2385 
2386 instruct Repl4L_mem(vecY dst, memory mem) %{
2387   predicate(n->as_Vector()->length() == 4);
2388   match(Set dst (ReplicateL (LoadL mem)));
2389   format %{ "movq    $dst,$mem\n\t"
2390             "punpcklqdq $dst,$dst\n\t"
2391             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2392   ins_encode %{
2393     __ movq($dst$$XMMRegister, $mem$$Address);
2394     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2395     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2396   %}
2397   ins_pipe( pipe_slow );
2398 %}
2399 
2400 // Replicate long (8 byte) scalar zero into a vector
2401 instruct Repl2L_zero(vecX dst, immL0 zero) %{
2402   predicate(n->as_Vector()->length() == 2);
2403   match(Set dst (ReplicateL zero));
2404   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
2405   ins_encode %{
2406     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2407   %}
2408   ins_pipe( fpu_reg_reg );
2409 %}
2410 
2411 instruct Repl4L_zero(vecY dst, immL0 zero) %{
2412   predicate(n->as_Vector()->length() == 4);
2413   match(Set dst (ReplicateL zero));
2414   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
2415   ins_encode %{
2416     // 256-bit vpxor needs AVX2; the MacroAssembler vpxor() falls back to vxorpd on plain AVX.
2417     bool vector256 = true;
2418     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2419   %}
2420   ins_pipe( fpu_reg_reg );
2421 %}
2422 
2423 // Replicate float (4 byte) scalar into a vector
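     // pshufd with immediate 0x00 selects dword 0 for every destination dword,
     // broadcasting the float in the low element across the 128-bit register.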
2424 instruct Repl2F(vecD dst, regF src) %{
2425   predicate(n->as_Vector()->length() == 2);
2426   match(Set dst (ReplicateF src));
2427   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
2428   ins_encode %{
2429     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2430   %}
2431   ins_pipe( fpu_reg_reg );
2432 %}
2433 
2434 instruct Repl4F(vecX dst, regF src) %{
2435   predicate(n->as_Vector()->length() == 4);
2436   match(Set dst (ReplicateF src));
2437   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
2438   ins_encode %{
2439     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2440   %}
2441   ins_pipe( pipe_slow );
2442 %}
2443 
2444 instruct Repl8F(vecY dst, regF src) %{
2445   predicate(n->as_Vector()->length() == 8);
2446   match(Set dst (ReplicateF src));
2447   format %{ "pshufd  $dst,$src,0x00\n\t"
2448             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
2449   ins_encode %{
2450     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2451     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2452   %}
2453   ins_pipe( pipe_slow );
2454 %}
2455 
2456 // Replicate float (4 byte) scalar zero into a vector
2457 instruct Repl2F_zero(vecD dst, immF0 zero) %{
2458   predicate(n->as_Vector()->length() == 2);
2459   match(Set dst (ReplicateF zero));
2460   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
2461   ins_encode %{
2462     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2463   %}
2464   ins_pipe( fpu_reg_reg );
2465 %}
2466 
2467 instruct Repl4F_zero(vecX dst, immF0 zero) %{
2468   predicate(n->as_Vector()->length() == 4);
2469   match(Set dst (ReplicateF zero));
2470   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
2471   ins_encode %{
2472     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2473   %}
2474   ins_pipe( fpu_reg_reg );
2475 %}
2476 
2477 instruct Repl8F_zero(vecY dst, immF0 zero) %{
2478   predicate(n->as_Vector()->length() == 8);
2479   match(Set dst (ReplicateF zero));
2480   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
2481   ins_encode %{
2482     bool vector256 = true;
2483     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2484   %}
2485   ins_pipe( fpu_reg_reg );
2486 %}
2487 
2488 // Replicate double (8 byte) scalar into a vector
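     // Here pshufd uses immediate 0x44, which selects dwords {0,1,0,1} and thus
     // duplicates the low 64-bit double into both halves of the 128-bit register;
     // the 256-bit form then copies that result into the upper half with
     // vinsertf128h.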
2489 instruct Repl2D(vecX dst, regD src) %{
2490   predicate(n->as_Vector()->length() == 2);
2491   match(Set dst (ReplicateD src));
2492   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
2493   ins_encode %{
2494     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2495   %}
2496   ins_pipe( pipe_slow );
2497 %}
2498 
2499 instruct Repl4D(vecY dst, regD src) %{
2500   predicate(n->as_Vector()->length() == 4);
2501   match(Set dst (ReplicateD src));
2502   format %{ "pshufd  $dst,$src,0x44\n\t"
2503             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
2504   ins_encode %{
2505     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2506     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2507   %}
2508   ins_pipe( pipe_slow );
2509 %}
2510 
2511 // Replicate double (8 byte) scalar zero into a vector
2512 instruct Repl2D_zero(vecX dst, immD0 zero) %{
2513   predicate(n->as_Vector()->length() == 2);
2514   match(Set dst (ReplicateD zero));
2515   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
2516   ins_encode %{
2517     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
2518   %}
2519   ins_pipe( fpu_reg_reg );
2520 %}
2521 
2522 instruct Repl4D_zero(vecY dst, immD0 zero) %{
2523   predicate(n->as_Vector()->length() == 4);
2524   match(Set dst (ReplicateD zero));
2525   format %{ "vxorpd  $dst,$dst,$dst\t! replicate4D zero" %}
2526   ins_encode %{
2527     bool vector256 = true;
2528     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2529   %}
2530   ins_pipe( fpu_reg_reg );
2531 %}
2532 
2533 // ====================VECTOR ARITHMETIC=======================================
2534 
2535 // --------------------------------- ADD --------------------------------------
2536 
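     // Each vector operation below follows the same pattern: a destructive SSE form
     // (dst = dst op src), an AVX three-operand register form, and, for 16- and
     // 32-byte vectors, an AVX form with a memory operand.  The vector256 flag picks
     // the 128- vs 256-bit VEX encoding; the 256-bit integer forms require AVX2
     // (UseAVX > 1) while the 256-bit float/double forms only need AVX (UseAVX > 0).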
2537 // Bytes vector add
2538 instruct vadd4B(vecS dst, vecS src) %{
2539   predicate(n->as_Vector()->length() == 4);
2540   match(Set dst (AddVB dst src));
2541   format %{ "paddb   $dst,$src\t! add packed4B" %}
2542   ins_encode %{
2543     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2544   %}
2545   ins_pipe( pipe_slow );
2546 %}
2547 
2548 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
2549   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2550   match(Set dst (AddVB src1 src2));
2551   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
2552   ins_encode %{
2553     bool vector256 = false;
2554     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2555   %}
2556   ins_pipe( pipe_slow );
2557 %}
2558 
2559 instruct vadd8B(vecD dst, vecD src) %{
2560   predicate(n->as_Vector()->length() == 8);
2561   match(Set dst (AddVB dst src));
2562   format %{ "paddb   $dst,$src\t! add packed8B" %}
2563   ins_encode %{
2564     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2565   %}
2566   ins_pipe( pipe_slow );
2567 %}
2568 
2569 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
2570   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2571   match(Set dst (AddVB src1 src2));
2572   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
2573   ins_encode %{
2574     bool vector256 = false;
2575     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2576   %}
2577   ins_pipe( pipe_slow );
2578 %}
2579 
2580 instruct vadd16B(vecX dst, vecX src) %{
2581   predicate(n->as_Vector()->length() == 16);
2582   match(Set dst (AddVB dst src));
2583   format %{ "paddb   $dst,$src\t! add packed16B" %}
2584   ins_encode %{
2585     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2586   %}
2587   ins_pipe( pipe_slow );
2588 %}
2589 
2590 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
2591   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2592   match(Set dst (AddVB src1 src2));
2593   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
2594   ins_encode %{
2595     bool vector256 = false;
2596     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2597   %}
2598   ins_pipe( pipe_slow );
2599 %}
2600 
2601 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
2602   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2603   match(Set dst (AddVB src (LoadVector mem)));
2604   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
2605   ins_encode %{
2606     bool vector256 = false;
2607     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2608   %}
2609   ins_pipe( pipe_slow );
2610 %}
2611 
2612 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
2613   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2614   match(Set dst (AddVB src1 src2));
2615   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
2616   ins_encode %{
2617     bool vector256 = true;
2618     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2619   %}
2620   ins_pipe( pipe_slow );
2621 %}
2622 
2623 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
2624   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2625   match(Set dst (AddVB src (LoadVector mem)));
2626   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
2627   ins_encode %{
2628     bool vector256 = true;
2629     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2630   %}
2631   ins_pipe( pipe_slow );
2632 %}
2633 
2634 // Shorts/Chars vector add
2635 instruct vadd2S(vecS dst, vecS src) %{
2636   predicate(n->as_Vector()->length() == 2);
2637   match(Set dst (AddVS dst src));
2638   format %{ "paddw   $dst,$src\t! add packed2S" %}
2639   ins_encode %{
2640     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2641   %}
2642   ins_pipe( pipe_slow );
2643 %}
2644 
2645 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
2646   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2647   match(Set dst (AddVS src1 src2));
2648   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
2649   ins_encode %{
2650     bool vector256 = false;
2651     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2652   %}
2653   ins_pipe( pipe_slow );
2654 %}
2655 
2656 instruct vadd4S(vecD dst, vecD src) %{
2657   predicate(n->as_Vector()->length() == 4);
2658   match(Set dst (AddVS dst src));
2659   format %{ "paddw   $dst,$src\t! add packed4S" %}
2660   ins_encode %{
2661     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2662   %}
2663   ins_pipe( pipe_slow );
2664 %}
2665 
2666 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
2667   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2668   match(Set dst (AddVS src1 src2));
2669   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
2670   ins_encode %{
2671     bool vector256 = false;
2672     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2673   %}
2674   ins_pipe( pipe_slow );
2675 %}
2676 
2677 instruct vadd8S(vecX dst, vecX src) %{
2678   predicate(n->as_Vector()->length() == 8);
2679   match(Set dst (AddVS dst src));
2680   format %{ "paddw   $dst,$src\t! add packed8S" %}
2681   ins_encode %{
2682     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2683   %}
2684   ins_pipe( pipe_slow );
2685 %}
2686 
2687 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
2688   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2689   match(Set dst (AddVS src1 src2));
2690   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
2691   ins_encode %{
2692     bool vector256 = false;
2693     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2694   %}
2695   ins_pipe( pipe_slow );
2696 %}
2697 
2698 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
2699   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2700   match(Set dst (AddVS src (LoadVector mem)));
2701   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
2702   ins_encode %{
2703     bool vector256 = false;
2704     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2705   %}
2706   ins_pipe( pipe_slow );
2707 %}
2708 
2709 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
2710   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
2711   match(Set dst (AddVS src1 src2));
2712   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
2713   ins_encode %{
2714     bool vector256 = true;
2715     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2716   %}
2717   ins_pipe( pipe_slow );
2718 %}
2719 
2720 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
2721   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
2722   match(Set dst (AddVS src (LoadVector mem)));
2723   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
2724   ins_encode %{
2725     bool vector256 = true;
2726     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2727   %}
2728   ins_pipe( pipe_slow );
2729 %}
2730 
2731 // Integers vector add
2732 instruct vadd2I(vecD dst, vecD src) %{
2733   predicate(n->as_Vector()->length() == 2);
2734   match(Set dst (AddVI dst src));
2735   format %{ "paddd   $dst,$src\t! add packed2I" %}
2736   ins_encode %{
2737     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
2738   %}
2739   ins_pipe( pipe_slow );
2740 %}
2741 
2742 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
2743   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2744   match(Set dst (AddVI src1 src2));
2745   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
2746   ins_encode %{
2747     bool vector256 = false;
2748     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2749   %}
2750   ins_pipe( pipe_slow );
2751 %}
2752 
2753 instruct vadd4I(vecX dst, vecX src) %{
2754   predicate(n->as_Vector()->length() == 4);
2755   match(Set dst (AddVI dst src));
2756   format %{ "paddd   $dst,$src\t! add packed4I" %}
2757   ins_encode %{
2758     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
2759   %}
2760   ins_pipe( pipe_slow );
2761 %}
2762 
2763 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
2764   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2765   match(Set dst (AddVI src1 src2));
2766   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
2767   ins_encode %{
2768     bool vector256 = false;
2769     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2770   %}
2771   ins_pipe( pipe_slow );
2772 %}
2773 
2774 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
2775   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2776   match(Set dst (AddVI src (LoadVector mem)));
2777   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
2778   ins_encode %{
2779     bool vector256 = false;
2780     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2781   %}
2782   ins_pipe( pipe_slow );
2783 %}
2784 
2785 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
2786   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
2787   match(Set dst (AddVI src1 src2));
2788   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
2789   ins_encode %{
2790     bool vector256 = true;
2791     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2792   %}
2793   ins_pipe( pipe_slow );
2794 %}
2795 
2796 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
2797   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
2798   match(Set dst (AddVI src (LoadVector mem)));
2799   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
2800   ins_encode %{
2801     bool vector256 = true;
2802     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2803   %}
2804   ins_pipe( pipe_slow );
2805 %}
2806 
2807 // Longs vector add
2808 instruct vadd2L(vecX dst, vecX src) %{
2809   predicate(n->as_Vector()->length() == 2);
2810   match(Set dst (AddVL dst src));
2811   format %{ "paddq   $dst,$src\t! add packed2L" %}
2812   ins_encode %{
2813     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
2814   %}
2815   ins_pipe( pipe_slow );
2816 %}
2817 
2818 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
2819   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2820   match(Set dst (AddVL src1 src2));
2821   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
2822   ins_encode %{
2823     bool vector256 = false;
2824     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2825   %}
2826   ins_pipe( pipe_slow );
2827 %}
2828 
2829 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
2830   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2831   match(Set dst (AddVL src (LoadVector mem)));
2832   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
2833   ins_encode %{
2834     bool vector256 = false;
2835     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2836   %}
2837   ins_pipe( pipe_slow );
2838 %}
2839 
2840 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
2841   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
2842   match(Set dst (AddVL src1 src2));
2843   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
2844   ins_encode %{
2845     bool vector256 = true;
2846     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2847   %}
2848   ins_pipe( pipe_slow );
2849 %}
2850 
2851 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
2852   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
2853   match(Set dst (AddVL src (LoadVector mem)));
2854   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
2855   ins_encode %{
2856     bool vector256 = true;
2857     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2858   %}
2859   ins_pipe( pipe_slow );
2860 %}
2861 
2862 // Floats vector add
2863 instruct vadd2F(vecD dst, vecD src) %{
2864   predicate(n->as_Vector()->length() == 2);
2865   match(Set dst (AddVF dst src));
2866   format %{ "addps   $dst,$src\t! add packed2F" %}
2867   ins_encode %{
2868     __ addps($dst$$XMMRegister, $src$$XMMRegister);
2869   %}
2870   ins_pipe( pipe_slow );
2871 %}
2872 
2873 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
2874   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2875   match(Set dst (AddVF src1 src2));
2876   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
2877   ins_encode %{
2878     bool vector256 = false;
2879     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2880   %}
2881   ins_pipe( pipe_slow );
2882 %}
2883 
2884 instruct vadd4F(vecX dst, vecX src) %{
2885   predicate(n->as_Vector()->length() == 4);
2886   match(Set dst (AddVF dst src));
2887   format %{ "addps   $dst,$src\t! add packed4F" %}
2888   ins_encode %{
2889     __ addps($dst$$XMMRegister, $src$$XMMRegister);
2890   %}
2891   ins_pipe( pipe_slow );
2892 %}
2893 
2894 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
2895   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2896   match(Set dst (AddVF src1 src2));
2897   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
2898   ins_encode %{
2899     bool vector256 = false;
2900     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2901   %}
2902   ins_pipe( pipe_slow );
2903 %}
2904 
2905 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
2906   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2907   match(Set dst (AddVF src (LoadVector mem)));
2908   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
2909   ins_encode %{
2910     bool vector256 = false;
2911     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2912   %}
2913   ins_pipe( pipe_slow );
2914 %}
2915 
2916 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
2917   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2918   match(Set dst (AddVF src1 src2));
2919   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
2920   ins_encode %{
2921     bool vector256 = true;
2922     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2923   %}
2924   ins_pipe( pipe_slow );
2925 %}
2926 
2927 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
2928   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2929   match(Set dst (AddVF src (LoadVector mem)));
2930   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
2931   ins_encode %{
2932     bool vector256 = true;
2933     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2934   %}
2935   ins_pipe( pipe_slow );
2936 %}
2937 
2938 // Doubles vector add
2939 instruct vadd2D(vecX dst, vecX src) %{
2940   predicate(n->as_Vector()->length() == 2);
2941   match(Set dst (AddVD dst src));
2942   format %{ "addpd   $dst,$src\t! add packed2D" %}
2943   ins_encode %{
2944     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
2945   %}
2946   ins_pipe( pipe_slow );
2947 %}
2948 
2949 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
2950   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2951   match(Set dst (AddVD src1 src2));
2952   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
2953   ins_encode %{
2954     bool vector256 = false;
2955     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2956   %}
2957   ins_pipe( pipe_slow );
2958 %}
2959 
2960 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
2961   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2962   match(Set dst (AddVD src (LoadVector mem)));
2963   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
2964   ins_encode %{
2965     bool vector256 = false;
2966     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2967   %}
2968   ins_pipe( pipe_slow );
2969 %}
2970 
2971 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
2972   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2973   match(Set dst (AddVD src1 src2));
2974   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
2975   ins_encode %{
2976     bool vector256 = true;
2977     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2978   %}
2979   ins_pipe( pipe_slow );
2980 %}
2981 
2982 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
2983   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2984   match(Set dst (AddVD src (LoadVector mem)));
2985   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
2986   ins_encode %{
2987     bool vector256 = true;
2988     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2989   %}
2990   ins_pipe( pipe_slow );
2991 %}
2992 
2993 // --------------------------------- SUB --------------------------------------
2994 
2995 // Bytes vector sub
2996 instruct vsub4B(vecS dst, vecS src) %{
2997   predicate(n->as_Vector()->length() == 4);
2998   match(Set dst (SubVB dst src));
2999   format %{ "psubb   $dst,$src\t! sub packed4B" %}
3000   ins_encode %{
3001     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3002   %}
3003   ins_pipe( pipe_slow );
3004 %}
3005 
3006 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
3007   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3008   match(Set dst (SubVB src1 src2));
3009   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
3010   ins_encode %{
3011     bool vector256 = false;
3012     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3013   %}
3014   ins_pipe( pipe_slow );
3015 %}
3016 
3017 instruct vsub8B(vecD dst, vecD src) %{
3018   predicate(n->as_Vector()->length() == 8);
3019   match(Set dst (SubVB dst src));
3020   format %{ "psubb   $dst,$src\t! sub packed8B" %}
3021   ins_encode %{
3022     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3023   %}
3024   ins_pipe( pipe_slow );
3025 %}
3026 
3027 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
3028   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3029   match(Set dst (SubVB src1 src2));
3030   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
3031   ins_encode %{
3032     bool vector256 = false;
3033     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3034   %}
3035   ins_pipe( pipe_slow );
3036 %}
3037 
3038 instruct vsub16B(vecX dst, vecX src) %{
3039   predicate(n->as_Vector()->length() == 16);
3040   match(Set dst (SubVB dst src));
3041   format %{ "psubb   $dst,$src\t! sub packed16B" %}
3042   ins_encode %{
3043     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3044   %}
3045   ins_pipe( pipe_slow );
3046 %}
3047 
3048 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
3049   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3050   match(Set dst (SubVB src1 src2));
3051   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
3052   ins_encode %{
3053     bool vector256 = false;
3054     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3055   %}
3056   ins_pipe( pipe_slow );
3057 %}
3058 
3059 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
3060   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3061   match(Set dst (SubVB src (LoadVector mem)));
3062   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
3063   ins_encode %{
3064     bool vector256 = false;
3065     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3066   %}
3067   ins_pipe( pipe_slow );
3068 %}
3069 
3070 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
3071   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3072   match(Set dst (SubVB src1 src2));
3073   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
3074   ins_encode %{
3075     bool vector256 = true;
3076     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3077   %}
3078   ins_pipe( pipe_slow );
3079 %}
3080 
3081 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
3082   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3083   match(Set dst (SubVB src (LoadVector mem)));
3084   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
3085   ins_encode %{
3086     bool vector256 = true;
3087     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3088   %}
3089   ins_pipe( pipe_slow );
3090 %}
3091 
3092 // Shorts/Chars vector sub
3093 instruct vsub2S(vecS dst, vecS src) %{
3094   predicate(n->as_Vector()->length() == 2);
3095   match(Set dst (SubVS dst src));
3096   format %{ "psubw   $dst,$src\t! sub packed2S" %}
3097   ins_encode %{
3098     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3099   %}
3100   ins_pipe( pipe_slow );
3101 %}
3102 
3103 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
3104   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3105   match(Set dst (SubVS src1 src2));
3106   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
3107   ins_encode %{
3108     bool vector256 = false;
3109     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3110   %}
3111   ins_pipe( pipe_slow );
3112 %}
3113 
3114 instruct vsub4S(vecD dst, vecD src) %{
3115   predicate(n->as_Vector()->length() == 4);
3116   match(Set dst (SubVS dst src));
3117   format %{ "psubw   $dst,$src\t! sub packed4S" %}
3118   ins_encode %{
3119     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3120   %}
3121   ins_pipe( pipe_slow );
3122 %}
3123 
3124 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
3125   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3126   match(Set dst (SubVS src1 src2));
3127   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
3128   ins_encode %{
3129     bool vector256 = false;
3130     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3131   %}
3132   ins_pipe( pipe_slow );
3133 %}
3134 
3135 instruct vsub8S(vecX dst, vecX src) %{
3136   predicate(n->as_Vector()->length() == 8);
3137   match(Set dst (SubVS dst src));
3138   format %{ "psubw   $dst,$src\t! sub packed8S" %}
3139   ins_encode %{
3140     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3141   %}
3142   ins_pipe( pipe_slow );
3143 %}
3144 
3145 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
3146   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3147   match(Set dst (SubVS src1 src2));
3148   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
3149   ins_encode %{
3150     bool vector256 = false;
3151     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3152   %}
3153   ins_pipe( pipe_slow );
3154 %}
3155 
3156 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
3157   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3158   match(Set dst (SubVS src (LoadVector mem)));
3159   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
3160   ins_encode %{
3161     bool vector256 = false;
3162     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3163   %}
3164   ins_pipe( pipe_slow );
3165 %}
3166 
3167 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
3168   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3169   match(Set dst (SubVS src1 src2));
3170   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
3171   ins_encode %{
3172     bool vector256 = true;
3173     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3174   %}
3175   ins_pipe( pipe_slow );
3176 %}
3177 
3178 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
3179   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3180   match(Set dst (SubVS src (LoadVector mem)));
3181   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
3182   ins_encode %{
3183     bool vector256 = true;
3184     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3185   %}
3186   ins_pipe( pipe_slow );
3187 %}
3188 
3189 // Integers vector sub
3190 instruct vsub2I(vecD dst, vecD src) %{
3191   predicate(n->as_Vector()->length() == 2);
3192   match(Set dst (SubVI dst src));
3193   format %{ "psubd   $dst,$src\t! sub packed2I" %}
3194   ins_encode %{
3195     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3196   %}
3197   ins_pipe( pipe_slow );
3198 %}
3199 
3200 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
3201   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3202   match(Set dst (SubVI src1 src2));
3203   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
3204   ins_encode %{
3205     bool vector256 = false;
3206     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3207   %}
3208   ins_pipe( pipe_slow );
3209 %}
3210 
3211 instruct vsub4I(vecX dst, vecX src) %{
3212   predicate(n->as_Vector()->length() == 4);
3213   match(Set dst (SubVI dst src));
3214   format %{ "psubd   $dst,$src\t! sub packed4I" %}
3215   ins_encode %{
3216     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3217   %}
3218   ins_pipe( pipe_slow );
3219 %}
3220 
3221 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
3222   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3223   match(Set dst (SubVI src1 src2));
3224   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
3225   ins_encode %{
3226     bool vector256 = false;
3227     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3228   %}
3229   ins_pipe( pipe_slow );
3230 %}
3231 
3232 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
3233   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3234   match(Set dst (SubVI src (LoadVector mem)));
3235   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
3236   ins_encode %{
3237     bool vector256 = false;
3238     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3239   %}
3240   ins_pipe( pipe_slow );
3241 %}
3242 
3243 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
3244   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3245   match(Set dst (SubVI src1 src2));
3246   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
3247   ins_encode %{
3248     bool vector256 = true;
3249     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3250   %}
3251   ins_pipe( pipe_slow );
3252 %}
3253 
3254 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
3255   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3256   match(Set dst (SubVI src (LoadVector mem)));
3257   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
3258   ins_encode %{
3259     bool vector256 = true;
3260     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3261   %}
3262   ins_pipe( pipe_slow );
3263 %}
3264 
3265 // Longs vector sub
3266 instruct vsub2L(vecX dst, vecX src) %{
3267   predicate(n->as_Vector()->length() == 2);
3268   match(Set dst (SubVL dst src));
3269   format %{ "psubq   $dst,$src\t! sub packed2L" %}
3270   ins_encode %{
3271     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
3272   %}
3273   ins_pipe( pipe_slow );
3274 %}
3275 
3276 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
3277   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3278   match(Set dst (SubVL src1 src2));
3279   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
3280   ins_encode %{
3281     bool vector256 = false;
3282     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3283   %}
3284   ins_pipe( pipe_slow );
3285 %}
3286 
3287 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
3288   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3289   match(Set dst (SubVL src (LoadVector mem)));
3290   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
3291   ins_encode %{
3292     bool vector256 = false;
3293     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3294   %}
3295   ins_pipe( pipe_slow );
3296 %}
3297 
3298 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
3299   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3300   match(Set dst (SubVL src1 src2));
3301   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
3302   ins_encode %{
3303     bool vector256 = true;
3304     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3305   %}
3306   ins_pipe( pipe_slow );
3307 %}
3308 
3309 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
3310   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3311   match(Set dst (SubVL src (LoadVector mem)));
3312   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
3313   ins_encode %{
3314     bool vector256 = true;
3315     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3316   %}
3317   ins_pipe( pipe_slow );
3318 %}
3319 
3320 // Floats vector sub
3321 instruct vsub2F(vecD dst, vecD src) %{
3322   predicate(n->as_Vector()->length() == 2);
3323   match(Set dst (SubVF dst src));
3324   format %{ "subps   $dst,$src\t! sub packed2F" %}
3325   ins_encode %{
3326     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3327   %}
3328   ins_pipe( pipe_slow );
3329 %}
3330 
3331 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
3332   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3333   match(Set dst (SubVF src1 src2));
3334   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
3335   ins_encode %{
3336     bool vector256 = false;
3337     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3338   %}
3339   ins_pipe( pipe_slow );
3340 %}
3341 
3342 instruct vsub4F(vecX dst, vecX src) %{
3343   predicate(n->as_Vector()->length() == 4);
3344   match(Set dst (SubVF dst src));
3345   format %{ "subps   $dst,$src\t! sub packed4F" %}
3346   ins_encode %{
3347     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3348   %}
3349   ins_pipe( pipe_slow );
3350 %}
3351 
3352 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
3353   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3354   match(Set dst (SubVF src1 src2));
3355   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
3356   ins_encode %{
3357     bool vector256 = false;
3358     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3359   %}
3360   ins_pipe( pipe_slow );
3361 %}
3362 
3363 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
3364   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3365   match(Set dst (SubVF src (LoadVector mem)));
3366   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
3367   ins_encode %{
3368     bool vector256 = false;
3369     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3370   %}
3371   ins_pipe( pipe_slow );
3372 %}
3373 
3374 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
3375   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3376   match(Set dst (SubVF src1 src2));
3377   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
3378   ins_encode %{
3379     bool vector256 = true;
3380     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3381   %}
3382   ins_pipe( pipe_slow );
3383 %}
3384 
3385 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
3386   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3387   match(Set dst (SubVF src (LoadVector mem)));
3388   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
3389   ins_encode %{
3390     bool vector256 = true;
3391     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3392   %}
3393   ins_pipe( pipe_slow );
3394 %}
3395 
3396 // Doubles vector sub
3397 instruct vsub2D(vecX dst, vecX src) %{
3398   predicate(n->as_Vector()->length() == 2);
3399   match(Set dst (SubVD dst src));
3400   format %{ "subpd   $dst,$src\t! sub packed2D" %}
3401   ins_encode %{
3402     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
3403   %}
3404   ins_pipe( pipe_slow );
3405 %}
3406 
3407 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
3408   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3409   match(Set dst (SubVD src1 src2));
3410   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
3411   ins_encode %{
3412     bool vector256 = false;
3413     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3414   %}
3415   ins_pipe( pipe_slow );
3416 %}
3417 
3418 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
3419   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3420   match(Set dst (SubVD src (LoadVector mem)));
3421   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
3422   ins_encode %{
3423     bool vector256 = false;
3424     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3425   %}
3426   ins_pipe( pipe_slow );
3427 %}
3428 
3429 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
3430   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3431   match(Set dst (SubVD src1 src2));
3432   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
3433   ins_encode %{
3434     bool vector256 = true;
3435     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3436   %}
3437   ins_pipe( pipe_slow );
3438 %}
3439 
3440 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
3441   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3442   match(Set dst (SubVD src (LoadVector mem)));
3443   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
3444   ins_encode %{
3445     bool vector256 = true;
3446     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3447   %}
3448   ins_pipe( pipe_slow );
3449 %}
3450 
3451 // --------------------------------- MUL --------------------------------------
3452 
3453 // Shorts/Chars vector mul
3454 instruct vmul2S(vecS dst, vecS src) %{
3455   predicate(n->as_Vector()->length() == 2);
3456   match(Set dst (MulVS dst src));
3457   format %{ "pmullw $dst,$src\t! mul packed2S" %}
3458   ins_encode %{
3459     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3460   %}
3461   ins_pipe( pipe_slow );
3462 %}
3463 
3464 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
3465   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3466   match(Set dst (MulVS src1 src2));
3467   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
3468   ins_encode %{
3469     bool vector256 = false;
3470     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3471   %}
3472   ins_pipe( pipe_slow );
3473 %}
3474 
3475 instruct vmul4S(vecD dst, vecD src) %{
3476   predicate(n->as_Vector()->length() == 4);
3477   match(Set dst (MulVS dst src));
3478   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
3479   ins_encode %{
3480     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3481   %}
3482   ins_pipe( pipe_slow );
3483 %}
3484 
3485 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
3486   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3487   match(Set dst (MulVS src1 src2));
3488   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
3489   ins_encode %{
3490     bool vector256 = false;
3491     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3492   %}
3493   ins_pipe( pipe_slow );
3494 %}
3495 
3496 instruct vmul8S(vecX dst, vecX src) %{
3497   predicate(n->as_Vector()->length() == 8);
3498   match(Set dst (MulVS dst src));
3499   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
3500   ins_encode %{
3501     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3502   %}
3503   ins_pipe( pipe_slow );
3504 %}
3505 
3506 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
3507   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3508   match(Set dst (MulVS src1 src2));
3509   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
3510   ins_encode %{
3511     bool vector256 = false;
3512     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3513   %}
3514   ins_pipe( pipe_slow );
3515 %}
3516 
3517 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
3518   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3519   match(Set dst (MulVS src (LoadVector mem)));
3520   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
3521   ins_encode %{
3522     bool vector256 = false;
3523     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3524   %}
3525   ins_pipe( pipe_slow );
3526 %}
3527 
3528 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
3529   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3530   match(Set dst (MulVS src1 src2));
3531   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
3532   ins_encode %{
3533     bool vector256 = true;
3534     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3535   %}
3536   ins_pipe( pipe_slow );
3537 %}
3538 
3539 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
3540   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3541   match(Set dst (MulVS src (LoadVector mem)));
3542   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
3543   ins_encode %{
3544     bool vector256 = true;
3545     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3546   %}
3547   ins_pipe( pipe_slow );
3548 %}
3549 
3550 // Integers vector mul (sse4_1)
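// Note: pmulld (packed doubleword multiply, keeping the low 32 bits of each
// product) is an SSE4.1 instruction, hence the UseSSE > 3 predicate on the
// non-AVX forms below.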
3551 instruct vmul2I(vecD dst, vecD src) %{
3552   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
3553   match(Set dst (MulVI dst src));
3554   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
3555   ins_encode %{
3556     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
3557   %}
3558   ins_pipe( pipe_slow );
3559 %}
3560 
3561 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
3562   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3563   match(Set dst (MulVI src1 src2));
3564   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
3565   ins_encode %{
3566     bool vector256 = false;
3567     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3568   %}
3569   ins_pipe( pipe_slow );
3570 %}
3571 
3572 instruct vmul4I(vecX dst, vecX src) %{
3573   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
3574   match(Set dst (MulVI dst src));
3575   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
3576   ins_encode %{
3577     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
3578   %}
3579   ins_pipe( pipe_slow );
3580 %}
3581 
3582 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
3583   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3584   match(Set dst (MulVI src1 src2));
3585   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
3586   ins_encode %{
3587     bool vector256 = false;
3588     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3589   %}
3590   ins_pipe( pipe_slow );
3591 %}
3592 
3593 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
3594   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3595   match(Set dst (MulVI src (LoadVector mem)));
3596   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
3597   ins_encode %{
3598     bool vector256 = false;
3599     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3600   %}
3601   ins_pipe( pipe_slow );
3602 %}
3603 
3604 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
3605   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3606   match(Set dst (MulVI src1 src2));
3607   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
3608   ins_encode %{
3609     bool vector256 = true;
3610     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3611   %}
3612   ins_pipe( pipe_slow );
3613 %}
3614 
3615 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
3616   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3617   match(Set dst (MulVI src (LoadVector mem)));
3618   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
3619   ins_encode %{
3620     bool vector256 = true;
3621     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3622   %}
3623   ins_pipe( pipe_slow );
3624 %}
3625 
3626 // Floats vector mul
3627 instruct vmul2F(vecD dst, vecD src) %{
3628   predicate(n->as_Vector()->length() == 2);
3629   match(Set dst (MulVF dst src));
3630   format %{ "mulps   $dst,$src\t! mul packed2F" %}
3631   ins_encode %{
3632     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
3633   %}
3634   ins_pipe( pipe_slow );
3635 %}
3636 
3637 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
3638   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3639   match(Set dst (MulVF src1 src2));
3640   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
3641   ins_encode %{
3642     bool vector256 = false;
3643     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3644   %}
3645   ins_pipe( pipe_slow );
3646 %}
3647 
3648 instruct vmul4F(vecX dst, vecX src) %{
3649   predicate(n->as_Vector()->length() == 4);
3650   match(Set dst (MulVF dst src));
3651   format %{ "mulps   $dst,$src\t! mul packed4F" %}
3652   ins_encode %{
3653     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
3654   %}
3655   ins_pipe( pipe_slow );
3656 %}
3657 
3658 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
3659   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3660   match(Set dst (MulVF src1 src2));
3661   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
3662   ins_encode %{
3663     bool vector256 = false;
3664     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3665   %}
3666   ins_pipe( pipe_slow );
3667 %}
3668 
3669 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
3670   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3671   match(Set dst (MulVF src (LoadVector mem)));
3672   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
3673   ins_encode %{
3674     bool vector256 = false;
3675     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3676   %}
3677   ins_pipe( pipe_slow );
3678 %}
3679 
3680 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
3681   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3682   match(Set dst (MulVF src1 src2));
3683   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
3684   ins_encode %{
3685     bool vector256 = true;
3686     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3687   %}
3688   ins_pipe( pipe_slow );
3689 %}
3690 
3691 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
3692   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3693   match(Set dst (MulVF src (LoadVector mem)));
3694   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
3695   ins_encode %{
3696     bool vector256 = true;
3697     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3698   %}
3699   ins_pipe( pipe_slow );
3700 %}
3701 
3702 // Doubles vector mul
3703 instruct vmul2D(vecX dst, vecX src) %{
3704   predicate(n->as_Vector()->length() == 2);
3705   match(Set dst (MulVD dst src));
3706   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
3707   ins_encode %{
3708     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
3709   %}
3710   ins_pipe( pipe_slow );
3711 %}
3712 
3713 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
3714   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3715   match(Set dst (MulVD src1 src2));
3716   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
3717   ins_encode %{
3718     bool vector256 = false;
3719     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3720   %}
3721   ins_pipe( pipe_slow );
3722 %}
3723 
3724 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
3725   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3726   match(Set dst (MulVD src (LoadVector mem)));
3727   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
3728   ins_encode %{
3729     bool vector256 = false;
3730     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3731   %}
3732   ins_pipe( pipe_slow );
3733 %}
3734 
3735 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
3736   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3737   match(Set dst (MulVD src1 src2));
3738   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
3739   ins_encode %{
3740     bool vector256 = true;
3741     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3742   %}
3743   ins_pipe( pipe_slow );
3744 %}
3745 
3746 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
3747   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3748   match(Set dst (MulVD src (LoadVector mem)));
3749   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
3750   ins_encode %{
3751     bool vector256 = true;
3752     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3753   %}
3754   ins_pipe( pipe_slow );
3755 %}
3756 
3757 // --------------------------------- DIV --------------------------------------
3758 
3759 // Floats vector div
3760 instruct vdiv2F(vecD dst, vecD src) %{
3761   predicate(n->as_Vector()->length() == 2);
3762   match(Set dst (DivVF dst src));
3763   format %{ "divps   $dst,$src\t! div packed2F" %}
3764   ins_encode %{
3765     __ divps($dst$$XMMRegister, $src$$XMMRegister);
3766   %}
3767   ins_pipe( pipe_slow );
3768 %}
3769 
3770 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
3771   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3772   match(Set dst (DivVF src1 src2));
3773   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
3774   ins_encode %{
3775     bool vector256 = false;
3776     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3777   %}
3778   ins_pipe( pipe_slow );
3779 %}
3780 
3781 instruct vdiv4F(vecX dst, vecX src) %{
3782   predicate(n->as_Vector()->length() == 4);
3783   match(Set dst (DivVF dst src));
3784   format %{ "divps   $dst,$src\t! div packed4F" %}
3785   ins_encode %{
3786     __ divps($dst$$XMMRegister, $src$$XMMRegister);
3787   %}
3788   ins_pipe( pipe_slow );
3789 %}
3790 
3791 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
3792   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3793   match(Set dst (DivVF src1 src2));
3794   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
3795   ins_encode %{
3796     bool vector256 = false;
3797     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3798   %}
3799   ins_pipe( pipe_slow );
3800 %}
3801 
3802 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
3803   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3804   match(Set dst (DivVF src (LoadVector mem)));
3805   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
3806   ins_encode %{
3807     bool vector256 = false;
3808     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3809   %}
3810   ins_pipe( pipe_slow );
3811 %}
3812 
3813 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
3814   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3815   match(Set dst (DivVF src1 src2));
3816   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
3817   ins_encode %{
3818     bool vector256 = true;
3819     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3820   %}
3821   ins_pipe( pipe_slow );
3822 %}
3823 
3824 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
3825   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3826   match(Set dst (DivVF src (LoadVector mem)));
3827   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
3828   ins_encode %{
3829     bool vector256 = true;
3830     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3831   %}
3832   ins_pipe( pipe_slow );
3833 %}
3834 
3835 // Doubles vector div
3836 instruct vdiv2D(vecX dst, vecX src) %{
3837   predicate(n->as_Vector()->length() == 2);
3838   match(Set dst (DivVD dst src));
3839   format %{ "divpd   $dst,$src\t! div packed2D" %}
3840   ins_encode %{
3841     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
3842   %}
3843   ins_pipe( pipe_slow );
3844 %}
3845 
3846 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
3847   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3848   match(Set dst (DivVD src1 src2));
3849   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
3850   ins_encode %{
3851     bool vector256 = false;
3852     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3853   %}
3854   ins_pipe( pipe_slow );
3855 %}
3856 
3857 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
3858   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3859   match(Set dst (DivVD src (LoadVector mem)));
3860   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
3861   ins_encode %{
3862     bool vector256 = false;
3863     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3864   %}
3865   ins_pipe( pipe_slow );
3866 %}
3867 
3868 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
3869   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3870   match(Set dst (DivVD src1 src2));
3871   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
3872   ins_encode %{
3873     bool vector256 = true;
3874     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3875   %}
3876   ins_pipe( pipe_slow );
3877 %}
3878 
3879 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
3880   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3881   match(Set dst (DivVD src (LoadVector mem)));
3882   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
3883   ins_encode %{
3884     bool vector256 = true;
3885     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3886   %}
3887   ins_pipe( pipe_slow );
3888 %}
3889 
3890 // ------------------------------ Shift ---------------------------------------
3891 
// Left and right shift count vectors are the same on x86
// (only the lowest bits of the xmm register are used as the count).
3894 instruct vshiftcnt(vecS dst, rRegI cnt) %{
3895   match(Set dst (LShiftCntV cnt));
3896   match(Set dst (RShiftCntV cnt));
3897   format %{ "movd    $dst,$cnt\t! load shift count" %}
3898   ins_encode %{
3899     __ movdl($dst$$XMMRegister, $cnt$$Register);
3900   %}
3901   ins_pipe( pipe_slow );
3902 %}
3903 
3904 // ------------------------------ LeftShift -----------------------------------
3905 
3906 // Shorts/Chars vector left shift
3907 instruct vsll2S(vecS dst, vecS shift) %{
3908   predicate(n->as_Vector()->length() == 2);
3909   match(Set dst (LShiftVS dst shift));
3910   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
3911   ins_encode %{
3912     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3913   %}
3914   ins_pipe( pipe_slow );
3915 %}
3916 
3917 instruct vsll2S_imm(vecS dst, immI8 shift) %{
3918   predicate(n->as_Vector()->length() == 2);
3919   match(Set dst (LShiftVS dst shift));
3920   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
3921   ins_encode %{
3922     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3923   %}
3924   ins_pipe( pipe_slow );
3925 %}
3926 
3927 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
3928   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3929   match(Set dst (LShiftVS src shift));
3930   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
3931   ins_encode %{
3932     bool vector256 = false;
3933     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3934   %}
3935   ins_pipe( pipe_slow );
3936 %}
3937 
3938 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
3939   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3940   match(Set dst (LShiftVS src shift));
3941   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
3942   ins_encode %{
3943     bool vector256 = false;
3944     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3945   %}
3946   ins_pipe( pipe_slow );
3947 %}
3948 
3949 instruct vsll4S(vecD dst, vecS shift) %{
3950   predicate(n->as_Vector()->length() == 4);
3951   match(Set dst (LShiftVS dst shift));
3952   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
3953   ins_encode %{
3954     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3955   %}
3956   ins_pipe( pipe_slow );
3957 %}
3958 
3959 instruct vsll4S_imm(vecD dst, immI8 shift) %{
3960   predicate(n->as_Vector()->length() == 4);
3961   match(Set dst (LShiftVS dst shift));
3962   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
3963   ins_encode %{
3964     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3965   %}
3966   ins_pipe( pipe_slow );
3967 %}
3968 
3969 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
3970   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3971   match(Set dst (LShiftVS src shift));
3972   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
3973   ins_encode %{
3974     bool vector256 = false;
3975     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3976   %}
3977   ins_pipe( pipe_slow );
3978 %}
3979 
3980 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
3981   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3982   match(Set dst (LShiftVS src shift));
3983   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
3984   ins_encode %{
3985     bool vector256 = false;
3986     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3987   %}
3988   ins_pipe( pipe_slow );
3989 %}
3990 
3991 instruct vsll8S(vecX dst, vecS shift) %{
3992   predicate(n->as_Vector()->length() == 8);
3993   match(Set dst (LShiftVS dst shift));
3994   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
3995   ins_encode %{
3996     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3997   %}
3998   ins_pipe( pipe_slow );
3999 %}
4000 
4001 instruct vsll8S_imm(vecX dst, immI8 shift) %{
4002   predicate(n->as_Vector()->length() == 8);
4003   match(Set dst (LShiftVS dst shift));
4004   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
4005   ins_encode %{
4006     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4007   %}
4008   ins_pipe( pipe_slow );
4009 %}
4010 
4011 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
4012   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4013   match(Set dst (LShiftVS src shift));
4014   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4015   ins_encode %{
4016     bool vector256 = false;
4017     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4018   %}
4019   ins_pipe( pipe_slow );
4020 %}
4021 
4022 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4023   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4024   match(Set dst (LShiftVS src shift));
4025   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4026   ins_encode %{
4027     bool vector256 = false;
4028     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4029   %}
4030   ins_pipe( pipe_slow );
4031 %}
4032 
4033 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
4034   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4035   match(Set dst (LShiftVS src shift));
4036   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4037   ins_encode %{
4038     bool vector256 = true;
4039     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4040   %}
4041   ins_pipe( pipe_slow );
4042 %}
4043 
4044 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4045   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4046   match(Set dst (LShiftVS src shift));
4047   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4048   ins_encode %{
4049     bool vector256 = true;
4050     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4051   %}
4052   ins_pipe( pipe_slow );
4053 %}
4054 
4055 // Integers vector left shift
4056 instruct vsll2I(vecD dst, vecS shift) %{
4057   predicate(n->as_Vector()->length() == 2);
4058   match(Set dst (LShiftVI dst shift));
4059   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4060   ins_encode %{
4061     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4062   %}
4063   ins_pipe( pipe_slow );
4064 %}
4065 
4066 instruct vsll2I_imm(vecD dst, immI8 shift) %{
4067   predicate(n->as_Vector()->length() == 2);
4068   match(Set dst (LShiftVI dst shift));
4069   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4070   ins_encode %{
4071     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4072   %}
4073   ins_pipe( pipe_slow );
4074 %}
4075 
4076 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
4077   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4078   match(Set dst (LShiftVI src shift));
4079   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4080   ins_encode %{
4081     bool vector256 = false;
4082     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4083   %}
4084   ins_pipe( pipe_slow );
4085 %}
4086 
4087 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4088   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4089   match(Set dst (LShiftVI src shift));
4090   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4091   ins_encode %{
4092     bool vector256 = false;
4093     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4094   %}
4095   ins_pipe( pipe_slow );
4096 %}
4097 
4098 instruct vsll4I(vecX dst, vecS shift) %{
4099   predicate(n->as_Vector()->length() == 4);
4100   match(Set dst (LShiftVI dst shift));
4101   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4102   ins_encode %{
4103     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4104   %}
4105   ins_pipe( pipe_slow );
4106 %}
4107 
4108 instruct vsll4I_imm(vecX dst, immI8 shift) %{
4109   predicate(n->as_Vector()->length() == 4);
4110   match(Set dst (LShiftVI dst shift));
4111   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4112   ins_encode %{
4113     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4114   %}
4115   ins_pipe( pipe_slow );
4116 %}
4117 
4118 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
4119   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4120   match(Set dst (LShiftVI src shift));
4121   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4122   ins_encode %{
4123     bool vector256 = false;
4124     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4125   %}
4126   ins_pipe( pipe_slow );
4127 %}
4128 
4129 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4130   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4131   match(Set dst (LShiftVI src shift));
4132   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4133   ins_encode %{
4134     bool vector256 = false;
4135     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4136   %}
4137   ins_pipe( pipe_slow );
4138 %}
4139 
4140 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
4141   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4142   match(Set dst (LShiftVI src shift));
4143   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4144   ins_encode %{
4145     bool vector256 = true;
4146     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4147   %}
4148   ins_pipe( pipe_slow );
4149 %}
4150 
4151 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4152   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4153   match(Set dst (LShiftVI src shift));
4154   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4155   ins_encode %{
4156     bool vector256 = true;
4157     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4158   %}
4159   ins_pipe( pipe_slow );
4160 %}
4161 
4162 // Longs vector left shift
4163 instruct vsll2L(vecX dst, vecS shift) %{
4164   predicate(n->as_Vector()->length() == 2);
4165   match(Set dst (LShiftVL dst shift));
4166   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4167   ins_encode %{
4168     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
4169   %}
4170   ins_pipe( pipe_slow );
4171 %}
4172 
4173 instruct vsll2L_imm(vecX dst, immI8 shift) %{
4174   predicate(n->as_Vector()->length() == 2);
4175   match(Set dst (LShiftVL dst shift));
4176   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4177   ins_encode %{
4178     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
4179   %}
4180   ins_pipe( pipe_slow );
4181 %}
4182 
4183 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
4184   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4185   match(Set dst (LShiftVL src shift));
4186   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4187   ins_encode %{
4188     bool vector256 = false;
4189     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4190   %}
4191   ins_pipe( pipe_slow );
4192 %}
4193 
4194 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4195   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4196   match(Set dst (LShiftVL src shift));
4197   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4198   ins_encode %{
4199     bool vector256 = false;
4200     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4201   %}
4202   ins_pipe( pipe_slow );
4203 %}
4204 
4205 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
4206   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4207   match(Set dst (LShiftVL src shift));
4208   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4209   ins_encode %{
4210     bool vector256 = true;
4211     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4212   %}
4213   ins_pipe( pipe_slow );
4214 %}
4215 
4216 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4217   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4218   match(Set dst (LShiftVL src shift));
4219   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4220   ins_encode %{
4221     bool vector256 = true;
4222     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4223   %}
4224   ins_pipe( pipe_slow );
4225 %}
4226 
4227 // ----------------------- LogicalRightShift -----------------------------------
4228 
// Shorts vector logical right shift produces an incorrect Java result for
// negative data because Java code converts short values to int with sign
// extension before the shift. Char vectors are fine, though, since chars are
// unsigned values.
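// Worked example (illustrative): for short s = -4 (0xFFFC), Java evaluates
// (short)(s >>> 1) by widening s to int 0xFFFFFFFC, shifting to 0x7FFFFFFE and
// narrowing to 0xFFFE (-2), whereas a packed 16-bit psrlw lane would yield
// 0x7FFE (32766). For char data both routes agree, because zero-extension
// leaves the widened value's upper bits clear.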
4233 
4234 instruct vsrl2S(vecS dst, vecS shift) %{
4235   predicate(n->as_Vector()->length() == 2);
4236   match(Set dst (URShiftVS dst shift));
4237   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4238   ins_encode %{
4239     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4240   %}
4241   ins_pipe( pipe_slow );
4242 %}
4243 
4244 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
4245   predicate(n->as_Vector()->length() == 2);
4246   match(Set dst (URShiftVS dst shift));
4247   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4248   ins_encode %{
4249     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4250   %}
4251   ins_pipe( pipe_slow );
4252 %}
4253 
4254 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
4255   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4256   match(Set dst (URShiftVS src shift));
4257   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4258   ins_encode %{
4259     bool vector256 = false;
4260     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4261   %}
4262   ins_pipe( pipe_slow );
4263 %}
4264 
4265 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4266   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4267   match(Set dst (URShiftVS src shift));
4268   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4269   ins_encode %{
4270     bool vector256 = false;
4271     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4272   %}
4273   ins_pipe( pipe_slow );
4274 %}
4275 
4276 instruct vsrl4S(vecD dst, vecS shift) %{
4277   predicate(n->as_Vector()->length() == 4);
4278   match(Set dst (URShiftVS dst shift));
4279   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4280   ins_encode %{
4281     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4282   %}
4283   ins_pipe( pipe_slow );
4284 %}
4285 
4286 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
4287   predicate(n->as_Vector()->length() == 4);
4288   match(Set dst (URShiftVS dst shift));
4289   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4290   ins_encode %{
4291     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4292   %}
4293   ins_pipe( pipe_slow );
4294 %}
4295 
4296 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
4297   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4298   match(Set dst (URShiftVS src shift));
4299   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4300   ins_encode %{
4301     bool vector256 = false;
4302     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4303   %}
4304   ins_pipe( pipe_slow );
4305 %}
4306 
4307 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4308   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4309   match(Set dst (URShiftVS src shift));
4310   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4311   ins_encode %{
4312     bool vector256 = false;
4313     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4314   %}
4315   ins_pipe( pipe_slow );
4316 %}
4317 
4318 instruct vsrl8S(vecX dst, vecS shift) %{
4319   predicate(n->as_Vector()->length() == 8);
4320   match(Set dst (URShiftVS dst shift));
4321   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4322   ins_encode %{
4323     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4324   %}
4325   ins_pipe( pipe_slow );
4326 %}
4327 
4328 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
4329   predicate(n->as_Vector()->length() == 8);
4330   match(Set dst (URShiftVS dst shift));
4331   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4332   ins_encode %{
4333     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4334   %}
4335   ins_pipe( pipe_slow );
4336 %}
4337 
4338 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
4339   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4340   match(Set dst (URShiftVS src shift));
4341   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4342   ins_encode %{
4343     bool vector256 = false;
4344     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4345   %}
4346   ins_pipe( pipe_slow );
4347 %}
4348 
4349 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4350   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4351   match(Set dst (URShiftVS src shift));
4352   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4353   ins_encode %{
4354     bool vector256 = false;
4355     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4356   %}
4357   ins_pipe( pipe_slow );
4358 %}
4359 
4360 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
4361   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4362   match(Set dst (URShiftVS src shift));
4363   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4364   ins_encode %{
4365     bool vector256 = true;
4366     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4367   %}
4368   ins_pipe( pipe_slow );
4369 %}
4370 
4371 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4372   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4373   match(Set dst (URShiftVS src shift));
4374   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4375   ins_encode %{
4376     bool vector256 = true;
4377     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4378   %}
4379   ins_pipe( pipe_slow );
4380 %}
4381 
4382 // Integers vector logical right shift
4383 instruct vsrl2I(vecD dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_imm(vecD dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector logical right shift
instruct vsrl2L(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// ------------------- ArithmeticRightShift -----------------------------------

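// As an illustrative reminder (not matched code): for a 16-bit lane holding 0x8010,
// a logical right shift by 2 yields 0x2004, while an arithmetic right shift by 2
// propagates the sign bit and yields 0xE004.
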
// Shorts/Chars vector arithmetic right shift
instruct vsra2S(vecS dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_imm(vecS dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S(vecD dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_imm(vecD dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS dst shift));
  format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    __ psraw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (RShiftVS src shift));
  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector arithmetic right shift
instruct vsra2I(vecD dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_imm(vecD dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I(vecX dst, vecS shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// There are no vector arithmetic right shift instructions for longs
// (SSE/AVX provide psrlq for the logical form but no psraq).
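//
// For reference only, a hypothetical emulation sketch (not an instruct pattern in
// this file): for an immediate 0 < imm < 64 and a scratch XMM register tmp, a
// packed 2L arithmetic right shift could be composed from plain SSE2 operations:
//
//   pshufd(tmp, dst, 0xF5);   // replicate each lane's high dword: {hi0,hi0,hi1,hi1}
//   psrad(tmp, 31);           // per-lane sign mask: all ones if negative, else zero
//   psllq(tmp, 64 - imm);     // keep only the top 'imm' sign bits of each lane
//   psrlq(dst, imm);          // logical right shift of the value itself
//   por(dst, tmp);            // merge the sign bits to complete the arithmetic shift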


// --------------------------------- AND --------------------------------------

instruct vand4B(vecS dst, vecS src) %{
  predicate(n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (AndV src (LoadVector mem)));
  format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (AndV src (LoadVector mem)));
  format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- OR ---------------------------------------

instruct vor4B(vecS dst, vecS src) %{
  predicate(n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- XOR --------------------------------------

instruct vxor4B(vecS dst, vecS src) %{
  predicate(n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}