1 //
   2 // Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  256-bit registers of 8 words each, labeled (a)-h.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used by the SSE4.2 intrinsics,
  65 // the array copy stubs and superword operations (see the UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
  68 // Linux ABI:   No registers are preserved across function calls
  69 //              XMM0-XMM7 might hold parameters
  70 // Windows ABI: XMM6-XMM15 preserved across function calls
  71 //              XMM0-XMM3 might hold parameters
  72 
  73 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  74 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  75 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  76 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  77 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  78 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  79 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  80 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  81 
  82 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  83 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  84 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  85 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  86 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  87 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  88 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  89 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  90 
  91 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  92 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  93 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  94 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  95 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  96 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  97 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  98 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  99 
 100 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 101 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 102 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 103 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 104 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 105 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 106 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 107 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 108 
 109 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 110 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 111 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 112 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 113 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 114 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 115 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 116 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 117 
 118 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 119 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 120 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 121 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 122 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 123 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 124 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 125 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 126 
 127 #ifdef _WIN64
 128 
 129 reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
 130 reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 131 reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 132 reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 133 reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 134 reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 135 reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 136 reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 137 
 138 reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
 139 reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 140 reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 141 reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 142 reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 143 reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 144 reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 145 reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 146 
 147 reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
 148 reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 149 reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 150 reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 151 reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 152 reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 153 reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 154 reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 155 
 156 reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
 157 reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 158 reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 159 reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 160 reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 161 reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 162 reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 163 reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 164 
 165 reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
 166 reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 167 reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 168 reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 169 reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 170 reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 171 reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 172 reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 173 
 174 reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
 175 reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 176 reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 177 reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 178 reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 179 reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 180 reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 181 reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 182 
 183 reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
 184 reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 185 reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 186 reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 187 reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 188 reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 189 reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 190 reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 191 
 192 reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
 193 reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 194 reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 195 reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 196 reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 197 reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 198 reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 199 reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 200 
 201 reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
 202 reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 203 reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 204 reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 205 reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 206 reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 207 reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 208 reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 209 
 210 reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
 211 reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 212 reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 213 reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 214 reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 215 reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 216 reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 217 reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 218 
 219 #else // _WIN64
 220 
 221 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 222 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 223 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 224 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 225 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 226 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 227 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 228 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 229 
 230 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 231 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 232 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 233 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 234 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 235 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 236 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 237 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 238 
 239 #ifdef _LP64
 240 
 241 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 242 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 243 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 244 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 245 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 246 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 247 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 248 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 249 
 250 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 251 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 252 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 253 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 254 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 255 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 256 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 257 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 258 
 259 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 260 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 261 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 262 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 263 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 264 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 265 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 266 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 267 
 268 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 269 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 270 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 271 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 272 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 273 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 274 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 275 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 276 
 277 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 278 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 279 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 280 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 281 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 282 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 283 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 284 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 285 
 286 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 287 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 288 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 289 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 290 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 291 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 292 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 293 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 294 
 295 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 296 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 297 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 298 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 299 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 300 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 301 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 302 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 303 
 304 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 305 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 306 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 307 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 308 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 309 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 310 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 311 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 312 
 313 #endif // _LP64
 314 
 315 #endif // _WIN64
 316 
 317 #ifdef _LP64
 318 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 319 #else
 320 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 321 #endif // _LP64
 322 
 323 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 324                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 325                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 326                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 327                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 328                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 329                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 330                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 331 #ifdef _LP64
 332                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 333                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 334                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 335                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 336                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 337                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 338                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 339                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 340 #endif
 341                    );
 342 
 343 // flags allocation class should be last.
 344 alloc_class chunk2(RFLAGS);
 345 
 346 // Singleton class for condition codes
 347 reg_class int_flags(RFLAGS);
 348 
 349 // Class for all float registers
 350 reg_class float_reg(XMM0,
 351                     XMM1,
 352                     XMM2,
 353                     XMM3,
 354                     XMM4,
 355                     XMM5,
 356                     XMM6,
 357                     XMM7
 358 #ifdef _LP64
 359                    ,XMM8,
 360                     XMM9,
 361                     XMM10,
 362                     XMM11,
 363                     XMM12,
 364                     XMM13,
 365                     XMM14,
 366                     XMM15
 367 #endif
 368                     );
 369 
 370 // Class for all double registers
 371 reg_class double_reg(XMM0,  XMM0b,
 372                      XMM1,  XMM1b,
 373                      XMM2,  XMM2b,
 374                      XMM3,  XMM3b,
 375                      XMM4,  XMM4b,
 376                      XMM5,  XMM5b,
 377                      XMM6,  XMM6b,
 378                      XMM7,  XMM7b
 379 #ifdef _LP64
 380                     ,XMM8,  XMM8b,
 381                      XMM9,  XMM9b,
 382                      XMM10, XMM10b,
 383                      XMM11, XMM11b,
 384                      XMM12, XMM12b,
 385                      XMM13, XMM13b,
 386                      XMM14, XMM14b,
 387                      XMM15, XMM15b
 388 #endif
 389                      );
 390 
 391 // Class for all 32bit vector registers
 392 reg_class vectors_reg(XMM0,
 393                       XMM1,
 394                       XMM2,
 395                       XMM3,
 396                       XMM4,
 397                       XMM5,
 398                       XMM6,
 399                       XMM7
 400 #ifdef _LP64
 401                      ,XMM8,
 402                       XMM9,
 403                       XMM10,
 404                       XMM11,
 405                       XMM12,
 406                       XMM13,
 407                       XMM14,
 408                       XMM15
 409 #endif
 410                       );
 411 
 412 // Class for all 64bit vector registers
 413 reg_class vectord_reg(XMM0,  XMM0b,
 414                       XMM1,  XMM1b,
 415                       XMM2,  XMM2b,
 416                       XMM3,  XMM3b,
 417                       XMM4,  XMM4b,
 418                       XMM5,  XMM5b,
 419                       XMM6,  XMM6b,
 420                       XMM7,  XMM7b
 421 #ifdef _LP64
 422                      ,XMM8,  XMM8b,
 423                       XMM9,  XMM9b,
 424                       XMM10, XMM10b,
 425                       XMM11, XMM11b,
 426                       XMM12, XMM12b,
 427                       XMM13, XMM13b,
 428                       XMM14, XMM14b,
 429                       XMM15, XMM15b
 430 #endif
 431                       );
 432 
 433 // Class for all 128bit vector registers
 434 reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
 435                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 436                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 437                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 438                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 439                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 440                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 441                       XMM7,  XMM7b,  XMM7c,  XMM7d
 442 #ifdef _LP64
 443                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 444                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 445                       XMM10, XMM10b, XMM10c, XMM10d,
 446                       XMM11, XMM11b, XMM11c, XMM11d,
 447                       XMM12, XMM12b, XMM12c, XMM12d,
 448                       XMM13, XMM13b, XMM13c, XMM13d,
 449                       XMM14, XMM14b, XMM14c, XMM14d,
 450                       XMM15, XMM15b, XMM15c, XMM15d
 451 #endif
 452                       );
 453 
 454 // Class for all 256bit vector registers
 455 reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 456                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 457                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 458                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 459                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 460                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 461                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 462                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 463 #ifdef _LP64
 464                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 465                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 466                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 467                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 468                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 469                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 470                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 471                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 472 #endif
 473                       );
 474 
 475 %}
 476 
 477 source %{
 478   // Float masks come from different places depending on platform.
 479 #ifdef _LP64
 480   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 481   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 482   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 483   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 484 #else
 485   static address float_signmask()  { return (address)float_signmask_pool; }
 486   static address float_signflip()  { return (address)float_signflip_pool; }
 487   static address double_signmask() { return (address)double_signmask_pool; }
 488   static address double_signflip() { return (address)double_signflip_pool; }
 489 #endif
 490 
 491 
 492 const bool Matcher::match_rule_supported(int opcode) {
 493   if (!has_match_rule(opcode))
 494     return false;
 495 
 496   switch (opcode) {
 497     case Op_PopCountI:
 498     case Op_PopCountL:
 499       if (!UsePopCountInstruction)
 500         return false;
 501       break;
 502     case Op_MulVI:
 503       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
 504         return false;
 505       break;
 506     case Op_CompareAndSwapL:
 507 #ifdef _LP64
 508     case Op_CompareAndSwapP:
 509 #endif
 510       if (!VM_Version::supports_cx8())
 511         return false;
 512       break;
 513   }
 514 
 515   return true;  // By default, match rules are supported.
 516 }
 517 
 518 // Max vector size in bytes. 0 if not supported.
 519 const int Matcher::vector_width_in_bytes(BasicType bt) {
 520   assert(is_java_primitive(bt), "only primitive type vectors");
 521   if (UseSSE < 2) return 0;
 522   // SSE2 supports 128bit vectors for all types.
 523   // AVX2 supports 256bit vectors for all types.
 524   int size = (UseAVX > 1) ? 32 : 16;
 525   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 526   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 527     size = 32;
 528   // Use flag to limit vector size.
 529   size = MIN2(size,(int)MaxVectorSize);
 530   // Minimum 2 values in vector (or 4 for bytes).
 531   switch (bt) {
 532   case T_DOUBLE:
 533   case T_LONG:
 534     if (size < 16) return 0;
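         // fall through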
 535   case T_FLOAT:
 536   case T_INT:
 537     if (size < 8) return 0;
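         // fall through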
 538   case T_BOOLEAN:
 539   case T_BYTE:
 540   case T_CHAR:
 541   case T_SHORT:
 542     if (size < 4) return 0;
 543     break;
 544   default:
 545     ShouldNotReachHere();
 546   }
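       // For example: with UseAVX == 2 and MaxVectorSize >= 32, T_INT vectors are 32 bytes wide;
       // with UseAVX == 1 only T_FLOAT/T_DOUBLE vectors reach 32 bytes; plain SSE2 caps all types at 16.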
 547   return size;
 548 }
 549 
 550 // Limits on vector size (number of elements) loaded into vector.
 551 const int Matcher::max_vector_size(const BasicType bt) {
 552   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 553 }
 554 const int Matcher::min_vector_size(const BasicType bt) {
 555   int max_size = max_vector_size(bt);
 556   // Minimum vector load is 4 bytes: at least 4 elements for byte types, 2 otherwise.
 557   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
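       // e.g. T_BYTE -> at least 4 elements, T_INT -> at least 2, capped by max_vector_size(bt).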
 558   return MIN2(size,max_size);
 559 }
 560 
 561 // Vector ideal reg corresponding to specified size in bytes
 562 const int Matcher::vector_ideal_reg(int size) {
 563   assert(MaxVectorSize >= size, "");
 564   switch(size) {
 565     case  4: return Op_VecS;
 566     case  8: return Op_VecD;
 567     case 16: return Op_VecX;
 568     case 32: return Op_VecY;
 569   }
 570   ShouldNotReachHere();
 571   return 0;
 572 }
 573 
 574 // x86 supports misaligned vector stores/loads.
 575 const bool Matcher::misaligned_vectors_ok() {
 576   return !AlignVector; // can be changed by flag
 577 }
 578 
 579 // Helper methods for MachSpillCopyNode::implementation().
 580 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
 581                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 582   // In the 64-bit VM, size calculation is very complex, so the size is
 583   // obtained by emitting the instructions into a scratch buffer.
 584   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 585   assert(ireg == Op_VecS || // 32bit vector
 586          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 587          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 588          "no non-adjacent vector moves" );
 589   if (cbuf) {
 590     MacroAssembler _masm(cbuf);
 591     int offset = __ offset();
 592     switch (ireg) {
 593     case Op_VecS: // copy whole register
 594     case Op_VecD:
 595     case Op_VecX:
 596       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 597       break;
 598     case Op_VecY:
 599       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 600       break;
 601     default:
 602       ShouldNotReachHere();
 603     }
 604     int size = __ offset() - offset;
 605 #ifdef ASSERT
 606     // The 2-byte VEX prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
 607     assert(!do_size || size == 4, "incorrect size calculation");
 608 #endif
 609     return size;
 610 #ifndef PRODUCT
 611   } else if (!do_size) {
 612     switch (ireg) {
 613     case Op_VecS:
 614     case Op_VecD:
 615     case Op_VecX:
 616       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 617       break;
 618     case Op_VecY:
 619       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 620       break;
 621     default:
 622       ShouldNotReachHere();
 623     }
 624 #endif
 625   }
 626   // The 2-byte VEX prefix is used if UseAVX > 0, and it takes the same 2 bytes as the SIMD prefix.
 627   return 4;
 628 }
 629 
 630 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
 631                             int stack_offset, int reg, uint ireg, outputStream* st) {
 632   // In the 64-bit VM, size calculation is very complex, so the size is
 633   // obtained by emitting the instructions into a scratch buffer.
 634   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 635   if (cbuf) {
 636     MacroAssembler _masm(cbuf);
 637     int offset = __ offset();
 638     if (is_load) {
 639       switch (ireg) {
 640       case Op_VecS:
 641         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 642         break;
 643       case Op_VecD:
 644         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 645         break;
 646       case Op_VecX:
 647         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 648         break;
 649       case Op_VecY:
 650         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 651         break;
 652       default:
 653         ShouldNotReachHere();
 654       }
 655     } else { // store
 656       switch (ireg) {
 657       case Op_VecS:
 658         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 659         break;
 660       case Op_VecD:
 661         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 662         break;
 663       case Op_VecX:
 664         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 665         break;
 666       case Op_VecY:
 667         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 668         break;
 669       default:
 670         ShouldNotReachHere();
 671       }
 672     }
 673     int size = __ offset() - offset;
 674 #ifdef ASSERT
 675     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 676     // The 2-byte VEX prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
 677     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
 678 #endif
 679     return size;
 680 #ifndef PRODUCT
 681   } else if (!do_size) {
 682     if (is_load) {
 683       switch (ireg) {
 684       case Op_VecS:
 685         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 686         break;
 687       case Op_VecD:
 688         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 689         break;
 690        case Op_VecX:
 691         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 692         break;
 693       case Op_VecY:
 694         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 695         break;
 696       default:
 697         ShouldNotReachHere();
 698       }
 699     } else { // store
 700       switch (ireg) {
 701       case Op_VecS:
 702         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 703         break;
 704       case Op_VecD:
 705         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 706         break;
 707        case Op_VecX:
 708         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 709         break;
 710       case Op_VecY:
 711         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 712         break;
 713       default:
 714         ShouldNotReachHere();
 715       }
 716     }
 717 #endif
 718   }
 719   int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 720   // The 2-byte VEX prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
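       // For example (32-bit VM, where no REX prefix is needed): movdqu xmm, [rsp] encodes as
       // prefix, escape, opcode, ModRM, SIB = 5 bytes; a disp8 adds 1 byte, a disp32 adds 4.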
 721   return 5+offset_size;
 722 }
 723 
 724 static inline jfloat replicate4_imm(int con, int width) {
 725   // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
 726   assert(width == 1 || width == 2, "only byte or short types here");
 727   int bit_width = width * 8;
 728   jint val = con;
 729   val &= (1 << bit_width) - 1;  // mask off sign bits
 730   while(bit_width < 32) {
 731     val |= (val << bit_width);
 732     bit_width <<= 1;
 733   }
 734   jfloat fval = *((jfloat*) &val);  // coerce to float type
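       // e.g. replicate4_imm(0xAB, 1)   -> bit pattern 0xABABABAB,
       //      replicate4_imm(0x1234, 2) -> bit pattern 0x12341234 (returned reinterpreted as a jfloat).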
 735   return fval;
 736 }
 737 
 738 static inline jdouble replicate8_imm(int con, int width) {
 739   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
 740   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
 741   int bit_width = width * 8;
 742   jlong val = con;
 743   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
 744   while(bit_width < 64) {
 745     val |= (val << bit_width);
 746     bit_width <<= 1;
 747   }
 748   jdouble dval = *((jdouble*) &val);  // coerce to double type
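       // e.g. replicate8_imm(0xAB, 1)       -> 0xABABABABABABABAB,
       //      replicate8_imm(0x12345678, 4) -> 0x1234567812345678 (returned reinterpreted as a jdouble).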
 749   return dval;
 750 }
 751 
 752 #ifndef PRODUCT
 753   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 754     st->print("nop \t# %d bytes pad for loops and calls", _count);
 755   }
 756 #endif
 757 
 758   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 759     MacroAssembler _masm(&cbuf);
 760     __ nop(_count);
 761   }
 762 
 763   uint MachNopNode::size(PhaseRegAlloc*) const {
 764     return _count;
 765   }
 766 
 767 #ifndef PRODUCT
 768   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 769     st->print("# breakpoint");
 770   }
 771 #endif
 772 
 773   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 774     MacroAssembler _masm(&cbuf);
 775     __ int3();
 776   }
 777 
 778   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 779     return MachNode::size(ra_);
 780   }
 781 
 782 %}
 783 
 784 encode %{
 785 
 786   enc_class preserve_SP %{
 787     debug_only(int off0 = cbuf.insts_size());
 788     MacroAssembler _masm(&cbuf);
 789     // RBP is preserved across all calls, even compiled calls.
 790     // Use it to preserve RSP in places where the callee might change the SP.
 791     __ movptr(rbp_mh_SP_save, rsp);
 792     debug_only(int off1 = cbuf.insts_size());
 793     assert(off1 - off0 == preserve_SP_size(), "correct size prediction");
 794   %}
 795 
 796   enc_class restore_SP %{
 797     MacroAssembler _masm(&cbuf);
 798     __ movptr(rsp, rbp_mh_SP_save);
 799   %}
 800 
 801   enc_class call_epilog %{
 802     if (VerifyStackAtCalls) {
 803       // Check that stack depth is unchanged: find majik cookie on stack
 804       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 805       MacroAssembler _masm(&cbuf);
 806       Label L;
 807       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 808       __ jccb(Assembler::equal, L);
 809       // Die if stack mismatch
 810       __ int3();
 811       __ bind(L);
 812     }
 813   %}
 814 
 815 %}
 816 
 817 
 818 //----------OPERANDS-----------------------------------------------------------
 819 // Operand definitions must precede instruction definitions for correct parsing
 820 // in the ADLC because operands constitute user defined types which are used in
 821 // instruction definitions.
 822 
 823 // Vectors
 824 operand vecS() %{
 825   constraint(ALLOC_IN_RC(vectors_reg));
 826   match(VecS);
 827 
 828   format %{ %}
 829   interface(REG_INTER);
 830 %}
 831 
 832 operand vecD() %{
 833   constraint(ALLOC_IN_RC(vectord_reg));
 834   match(VecD);
 835 
 836   format %{ %}
 837   interface(REG_INTER);
 838 %}
 839 
 840 operand vecX() %{
 841   constraint(ALLOC_IN_RC(vectorx_reg));
 842   match(VecX);
 843 
 844   format %{ %}
 845   interface(REG_INTER);
 846 %}
 847 
 848 operand vecY() %{
 849   constraint(ALLOC_IN_RC(vectory_reg));
 850   match(VecY);
 851 
 852   format %{ %}
 853   interface(REG_INTER);
 854 %}
 855 
 856 
 857 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 858 
 859 // ============================================================================
 860 
 861 instruct ShouldNotReachHere() %{
 862   match(Halt);
 863   format %{ "int3\t# ShouldNotReachHere" %}
 864   ins_encode %{
 865     __ int3();
 866   %}
 867   ins_pipe(pipe_slow);
 868 %}
 869 
 870 // ============================================================================
 871 
 872 instruct addF_reg(regF dst, regF src) %{
 873   predicate((UseSSE>=1) && (UseAVX == 0));
 874   match(Set dst (AddF dst src));
 875 
 876   format %{ "addss   $dst, $src" %}
 877   ins_cost(150);
 878   ins_encode %{
 879     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 880   %}
 881   ins_pipe(pipe_slow);
 882 %}
 883 
 884 instruct addF_mem(regF dst, memory src) %{
 885   predicate((UseSSE>=1) && (UseAVX == 0));
 886   match(Set dst (AddF dst (LoadF src)));
 887 
 888   format %{ "addss   $dst, $src" %}
 889   ins_cost(150);
 890   ins_encode %{
 891     __ addss($dst$$XMMRegister, $src$$Address);
 892   %}
 893   ins_pipe(pipe_slow);
 894 %}
 895 
 896 instruct addF_imm(regF dst, immF con) %{
 897   predicate((UseSSE>=1) && (UseAVX == 0));
 898   match(Set dst (AddF dst con));
 899   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 900   ins_cost(150);
 901   ins_encode %{
 902     __ addss($dst$$XMMRegister, $constantaddress($con));
 903   %}
 904   ins_pipe(pipe_slow);
 905 %}
 906 
 907 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 908   predicate(UseAVX > 0);
 909   match(Set dst (AddF src1 src2));
 910 
 911   format %{ "vaddss  $dst, $src1, $src2" %}
 912   ins_cost(150);
 913   ins_encode %{
 914     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 915   %}
 916   ins_pipe(pipe_slow);
 917 %}
 918 
 919 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 920   predicate(UseAVX > 0);
 921   match(Set dst (AddF src1 (LoadF src2)));
 922 
 923   format %{ "vaddss  $dst, $src1, $src2" %}
 924   ins_cost(150);
 925   ins_encode %{
 926     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 927   %}
 928   ins_pipe(pipe_slow);
 929 %}
 930 
 931 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 932   predicate(UseAVX > 0);
 933   match(Set dst (AddF src con));
 934 
 935   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 936   ins_cost(150);
 937   ins_encode %{
 938     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 939   %}
 940   ins_pipe(pipe_slow);
 941 %}
 942 
 943 instruct addD_reg(regD dst, regD src) %{
 944   predicate((UseSSE>=2) && (UseAVX == 0));
 945   match(Set dst (AddD dst src));
 946 
 947   format %{ "addsd   $dst, $src" %}
 948   ins_cost(150);
 949   ins_encode %{
 950     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 951   %}
 952   ins_pipe(pipe_slow);
 953 %}
 954 
 955 instruct addD_mem(regD dst, memory src) %{
 956   predicate((UseSSE>=2) && (UseAVX == 0));
 957   match(Set dst (AddD dst (LoadD src)));
 958 
 959   format %{ "addsd   $dst, $src" %}
 960   ins_cost(150);
 961   ins_encode %{
 962     __ addsd($dst$$XMMRegister, $src$$Address);
 963   %}
 964   ins_pipe(pipe_slow);
 965 %}
 966 
 967 instruct addD_imm(regD dst, immD con) %{
 968   predicate((UseSSE>=2) && (UseAVX == 0));
 969   match(Set dst (AddD dst con));
 970   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 971   ins_cost(150);
 972   ins_encode %{
 973     __ addsd($dst$$XMMRegister, $constantaddress($con));
 974   %}
 975   ins_pipe(pipe_slow);
 976 %}
 977 
 978 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 979   predicate(UseAVX > 0);
 980   match(Set dst (AddD src1 src2));
 981 
 982   format %{ "vaddsd  $dst, $src1, $src2" %}
 983   ins_cost(150);
 984   ins_encode %{
 985     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 986   %}
 987   ins_pipe(pipe_slow);
 988 %}
 989 
 990 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 991   predicate(UseAVX > 0);
 992   match(Set dst (AddD src1 (LoadD src2)));
 993 
 994   format %{ "vaddsd  $dst, $src1, $src2" %}
 995   ins_cost(150);
 996   ins_encode %{
 997     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 998   %}
 999   ins_pipe(pipe_slow);
1000 %}
1001 
1002 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1003   predicate(UseAVX > 0);
1004   match(Set dst (AddD src con));
1005 
1006   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1007   ins_cost(150);
1008   ins_encode %{
1009     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1010   %}
1011   ins_pipe(pipe_slow);
1012 %}
1013 
1014 instruct subF_reg(regF dst, regF src) %{
1015   predicate((UseSSE>=1) && (UseAVX == 0));
1016   match(Set dst (SubF dst src));
1017 
1018   format %{ "subss   $dst, $src" %}
1019   ins_cost(150);
1020   ins_encode %{
1021     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1022   %}
1023   ins_pipe(pipe_slow);
1024 %}
1025 
1026 instruct subF_mem(regF dst, memory src) %{
1027   predicate((UseSSE>=1) && (UseAVX == 0));
1028   match(Set dst (SubF dst (LoadF src)));
1029 
1030   format %{ "subss   $dst, $src" %}
1031   ins_cost(150);
1032   ins_encode %{
1033     __ subss($dst$$XMMRegister, $src$$Address);
1034   %}
1035   ins_pipe(pipe_slow);
1036 %}
1037 
1038 instruct subF_imm(regF dst, immF con) %{
1039   predicate((UseSSE>=1) && (UseAVX == 0));
1040   match(Set dst (SubF dst con));
1041   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1042   ins_cost(150);
1043   ins_encode %{
1044     __ subss($dst$$XMMRegister, $constantaddress($con));
1045   %}
1046   ins_pipe(pipe_slow);
1047 %}
1048 
1049 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
1050   predicate(UseAVX > 0);
1051   match(Set dst (SubF src1 src2));
1052 
1053   format %{ "vsubss  $dst, $src1, $src2" %}
1054   ins_cost(150);
1055   ins_encode %{
1056     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1057   %}
1058   ins_pipe(pipe_slow);
1059 %}
1060 
1061 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
1062   predicate(UseAVX > 0);
1063   match(Set dst (SubF src1 (LoadF src2)));
1064 
1065   format %{ "vsubss  $dst, $src1, $src2" %}
1066   ins_cost(150);
1067   ins_encode %{
1068     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1069   %}
1070   ins_pipe(pipe_slow);
1071 %}
1072 
1073 instruct subF_reg_imm(regF dst, regF src, immF con) %{
1074   predicate(UseAVX > 0);
1075   match(Set dst (SubF src con));
1076 
1077   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1078   ins_cost(150);
1079   ins_encode %{
1080     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1081   %}
1082   ins_pipe(pipe_slow);
1083 %}
1084 
1085 instruct subD_reg(regD dst, regD src) %{
1086   predicate((UseSSE>=2) && (UseAVX == 0));
1087   match(Set dst (SubD dst src));
1088 
1089   format %{ "subsd   $dst, $src" %}
1090   ins_cost(150);
1091   ins_encode %{
1092     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
1093   %}
1094   ins_pipe(pipe_slow);
1095 %}
1096 
1097 instruct subD_mem(regD dst, memory src) %{
1098   predicate((UseSSE>=2) && (UseAVX == 0));
1099   match(Set dst (SubD dst (LoadD src)));
1100 
1101   format %{ "subsd   $dst, $src" %}
1102   ins_cost(150);
1103   ins_encode %{
1104     __ subsd($dst$$XMMRegister, $src$$Address);
1105   %}
1106   ins_pipe(pipe_slow);
1107 %}
1108 
1109 instruct subD_imm(regD dst, immD con) %{
1110   predicate((UseSSE>=2) && (UseAVX == 0));
1111   match(Set dst (SubD dst con));
1112   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1113   ins_cost(150);
1114   ins_encode %{
1115     __ subsd($dst$$XMMRegister, $constantaddress($con));
1116   %}
1117   ins_pipe(pipe_slow);
1118 %}
1119 
1120 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
1121   predicate(UseAVX > 0);
1122   match(Set dst (SubD src1 src2));
1123 
1124   format %{ "vsubsd  $dst, $src1, $src2" %}
1125   ins_cost(150);
1126   ins_encode %{
1127     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1128   %}
1129   ins_pipe(pipe_slow);
1130 %}
1131 
1132 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
1133   predicate(UseAVX > 0);
1134   match(Set dst (SubD src1 (LoadD src2)));
1135 
1136   format %{ "vsubsd  $dst, $src1, $src2" %}
1137   ins_cost(150);
1138   ins_encode %{
1139     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1140   %}
1141   ins_pipe(pipe_slow);
1142 %}
1143 
1144 instruct subD_reg_imm(regD dst, regD src, immD con) %{
1145   predicate(UseAVX > 0);
1146   match(Set dst (SubD src con));
1147 
1148   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1149   ins_cost(150);
1150   ins_encode %{
1151     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1152   %}
1153   ins_pipe(pipe_slow);
1154 %}
1155 
1156 instruct mulF_reg(regF dst, regF src) %{
1157   predicate((UseSSE>=1) && (UseAVX == 0));
1158   match(Set dst (MulF dst src));
1159 
1160   format %{ "mulss   $dst, $src" %}
1161   ins_cost(150);
1162   ins_encode %{
1163     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
1164   %}
1165   ins_pipe(pipe_slow);
1166 %}
1167 
1168 instruct mulF_mem(regF dst, memory src) %{
1169   predicate((UseSSE>=1) && (UseAVX == 0));
1170   match(Set dst (MulF dst (LoadF src)));
1171 
1172   format %{ "mulss   $dst, $src" %}
1173   ins_cost(150);
1174   ins_encode %{
1175     __ mulss($dst$$XMMRegister, $src$$Address);
1176   %}
1177   ins_pipe(pipe_slow);
1178 %}
1179 
1180 instruct mulF_imm(regF dst, immF con) %{
1181   predicate((UseSSE>=1) && (UseAVX == 0));
1182   match(Set dst (MulF dst con));
1183   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1184   ins_cost(150);
1185   ins_encode %{
1186     __ mulss($dst$$XMMRegister, $constantaddress($con));
1187   %}
1188   ins_pipe(pipe_slow);
1189 %}
1190 
1191 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
1192   predicate(UseAVX > 0);
1193   match(Set dst (MulF src1 src2));
1194 
1195   format %{ "vmulss  $dst, $src1, $src2" %}
1196   ins_cost(150);
1197   ins_encode %{
1198     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1199   %}
1200   ins_pipe(pipe_slow);
1201 %}
1202 
1203 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
1204   predicate(UseAVX > 0);
1205   match(Set dst (MulF src1 (LoadF src2)));
1206 
1207   format %{ "vmulss  $dst, $src1, $src2" %}
1208   ins_cost(150);
1209   ins_encode %{
1210     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1211   %}
1212   ins_pipe(pipe_slow);
1213 %}
1214 
1215 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
1216   predicate(UseAVX > 0);
1217   match(Set dst (MulF src con));
1218 
1219   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1220   ins_cost(150);
1221   ins_encode %{
1222     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1223   %}
1224   ins_pipe(pipe_slow);
1225 %}
1226 
1227 instruct mulD_reg(regD dst, regD src) %{
1228   predicate((UseSSE>=2) && (UseAVX == 0));
1229   match(Set dst (MulD dst src));
1230 
1231   format %{ "mulsd   $dst, $src" %}
1232   ins_cost(150);
1233   ins_encode %{
1234     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
1235   %}
1236   ins_pipe(pipe_slow);
1237 %}
1238 
1239 instruct mulD_mem(regD dst, memory src) %{
1240   predicate((UseSSE>=2) && (UseAVX == 0));
1241   match(Set dst (MulD dst (LoadD src)));
1242 
1243   format %{ "mulsd   $dst, $src" %}
1244   ins_cost(150);
1245   ins_encode %{
1246     __ mulsd($dst$$XMMRegister, $src$$Address);
1247   %}
1248   ins_pipe(pipe_slow);
1249 %}
1250 
1251 instruct mulD_imm(regD dst, immD con) %{
1252   predicate((UseSSE>=2) && (UseAVX == 0));
1253   match(Set dst (MulD dst con));
1254   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1255   ins_cost(150);
1256   ins_encode %{
1257     __ mulsd($dst$$XMMRegister, $constantaddress($con));
1258   %}
1259   ins_pipe(pipe_slow);
1260 %}
1261 
1262 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
1263   predicate(UseAVX > 0);
1264   match(Set dst (MulD src1 src2));
1265 
1266   format %{ "vmulsd  $dst, $src1, $src2" %}
1267   ins_cost(150);
1268   ins_encode %{
1269     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1270   %}
1271   ins_pipe(pipe_slow);
1272 %}
1273 
1274 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
1275   predicate(UseAVX > 0);
1276   match(Set dst (MulD src1 (LoadD src2)));
1277 
1278   format %{ "vmulsd  $dst, $src1, $src2" %}
1279   ins_cost(150);
1280   ins_encode %{
1281     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1282   %}
1283   ins_pipe(pipe_slow);
1284 %}
1285 
1286 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
1287   predicate(UseAVX > 0);
1288   match(Set dst (MulD src con));
1289 
1290   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1291   ins_cost(150);
1292   ins_encode %{
1293     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1294   %}
1295   ins_pipe(pipe_slow);
1296 %}
1297 
1298 instruct divF_reg(regF dst, regF src) %{
1299   predicate((UseSSE>=1) && (UseAVX == 0));
1300   match(Set dst (DivF dst src));
1301 
1302   format %{ "divss   $dst, $src" %}
1303   ins_cost(150);
1304   ins_encode %{
1305     __ divss($dst$$XMMRegister, $src$$XMMRegister);
1306   %}
1307   ins_pipe(pipe_slow);
1308 %}
1309 
1310 instruct divF_mem(regF dst, memory src) %{
1311   predicate((UseSSE>=1) && (UseAVX == 0));
1312   match(Set dst (DivF dst (LoadF src)));
1313 
1314   format %{ "divss   $dst, $src" %}
1315   ins_cost(150);
1316   ins_encode %{
1317     __ divss($dst$$XMMRegister, $src$$Address);
1318   %}
1319   ins_pipe(pipe_slow);
1320 %}
1321 
1322 instruct divF_imm(regF dst, immF con) %{
1323   predicate((UseSSE>=1) && (UseAVX == 0));
1324   match(Set dst (DivF dst con));
1325   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1326   ins_cost(150);
1327   ins_encode %{
1328     __ divss($dst$$XMMRegister, $constantaddress($con));
1329   %}
1330   ins_pipe(pipe_slow);
1331 %}
1332 
1333 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
1334   predicate(UseAVX > 0);
1335   match(Set dst (DivF src1 src2));
1336 
1337   format %{ "vdivss  $dst, $src1, $src2" %}
1338   ins_cost(150);
1339   ins_encode %{
1340     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1341   %}
1342   ins_pipe(pipe_slow);
1343 %}
1344 
1345 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
1346   predicate(UseAVX > 0);
1347   match(Set dst (DivF src1 (LoadF src2)));
1348 
1349   format %{ "vdivss  $dst, $src1, $src2" %}
1350   ins_cost(150);
1351   ins_encode %{
1352     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1353   %}
1354   ins_pipe(pipe_slow);
1355 %}
1356 
1357 instruct divF_reg_imm(regF dst, regF src, immF con) %{
1358   predicate(UseAVX > 0);
1359   match(Set dst (DivF src con));
1360 
1361   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1362   ins_cost(150);
1363   ins_encode %{
1364     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1365   %}
1366   ins_pipe(pipe_slow);
1367 %}
1368 
1369 instruct divD_reg(regD dst, regD src) %{
1370   predicate((UseSSE>=2) && (UseAVX == 0));
1371   match(Set dst (DivD dst src));
1372 
1373   format %{ "divsd   $dst, $src" %}
1374   ins_cost(150);
1375   ins_encode %{
1376     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
1377   %}
1378   ins_pipe(pipe_slow);
1379 %}
1380 
1381 instruct divD_mem(regD dst, memory src) %{
1382   predicate((UseSSE>=2) && (UseAVX == 0));
1383   match(Set dst (DivD dst (LoadD src)));
1384 
1385   format %{ "divsd   $dst, $src" %}
1386   ins_cost(150);
1387   ins_encode %{
1388     __ divsd($dst$$XMMRegister, $src$$Address);
1389   %}
1390   ins_pipe(pipe_slow);
1391 %}
1392 
1393 instruct divD_imm(regD dst, immD con) %{
1394   predicate((UseSSE>=2) && (UseAVX == 0));
1395   match(Set dst (DivD dst con));
1396   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1397   ins_cost(150);
1398   ins_encode %{
1399     __ divsd($dst$$XMMRegister, $constantaddress($con));
1400   %}
1401   ins_pipe(pipe_slow);
1402 %}
1403 
1404 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
1405   predicate(UseAVX > 0);
1406   match(Set dst (DivD src1 src2));
1407 
1408   format %{ "vdivsd  $dst, $src1, $src2" %}
1409   ins_cost(150);
1410   ins_encode %{
1411     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1412   %}
1413   ins_pipe(pipe_slow);
1414 %}
1415 
1416 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
1417   predicate(UseAVX > 0);
1418   match(Set dst (DivD src1 (LoadD src2)));
1419 
1420   format %{ "vdivsd  $dst, $src1, $src2" %}
1421   ins_cost(150);
1422   ins_encode %{
1423     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1424   %}
1425   ins_pipe(pipe_slow);
1426 %}
1427 
1428 instruct divD_reg_imm(regD dst, regD src, immD con) %{
1429   predicate(UseAVX > 0);
1430   match(Set dst (DivD src con));
1431 
1432   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1433   ins_cost(150);
1434   ins_encode %{
1435     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1436   %}
1437   ins_pipe(pipe_slow);
1438 %}
1439 
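     // Absolute value and negation below are bitwise operations on the
     // IEEE-754 sign bit: AND with 0x7fffffff (0x7fffffffffffffff for double)
     // clears the sign bit, XOR with 0x80000000 (0x8000000000000000) flips
     // it.  The masks live in memory and are referenced via ExternalAddress.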
1440 instruct absF_reg(regF dst) %{
1441   predicate((UseSSE>=1) && (UseAVX == 0));
1442   match(Set dst (AbsF dst));
1443   ins_cost(150);
1444   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
1445   ins_encode %{
1446     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
1447   %}
1448   ins_pipe(pipe_slow);
1449 %}
1450 
1451 instruct absF_reg_reg(regF dst, regF src) %{
1452   predicate(UseAVX > 0);
1453   match(Set dst (AbsF src));
1454   ins_cost(150);
1455   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
1456   ins_encode %{
1457     bool vector256 = false;
1458     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
1459               ExternalAddress(float_signmask()), vector256);
1460   %}
1461   ins_pipe(pipe_slow);
1462 %}
1463 
1464 instruct absD_reg(regD dst) %{
1465   predicate((UseSSE>=2) && (UseAVX == 0));
1466   match(Set dst (AbsD dst));
1467   ins_cost(150);
1468   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
1469             "# abs double by sign masking" %}
1470   ins_encode %{
1471     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
1472   %}
1473   ins_pipe(pipe_slow);
1474 %}
1475 
1476 instruct absD_reg_reg(regD dst, regD src) %{
1477   predicate(UseAVX > 0);
1478   match(Set dst (AbsD src));
1479   ins_cost(150);
1480   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
1481             "# abs double by sign masking" %}
1482   ins_encode %{
1483     bool vector256 = false;
1484     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
1485               ExternalAddress(double_signmask()), vector256);
1486   %}
1487   ins_pipe(pipe_slow);
1488 %}
1489 
1490 instruct negF_reg(regF dst) %{
1491   predicate((UseSSE>=1) && (UseAVX == 0));
1492   match(Set dst (NegF dst));
1493   ins_cost(150);
1494   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
1495   ins_encode %{
1496     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
1497   %}
1498   ins_pipe(pipe_slow);
1499 %}
1500 
1501 instruct negF_reg_reg(regF dst, regF src) %{
1502   predicate(UseAVX > 0);
1503   match(Set dst (NegF src));
1504   ins_cost(150);
1505   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
1506   ins_encode %{
1507     bool vector256 = false;
1508     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
1509               ExternalAddress(float_signflip()), vector256);
1510   %}
1511   ins_pipe(pipe_slow);
1512 %}
1513 
1514 instruct negD_reg(regD dst) %{
1515   predicate((UseSSE>=2) && (UseAVX == 0));
1516   match(Set dst (NegD dst));
1517   ins_cost(150);
1518   format %{ "xorpd   $dst, [0x8000000000000000]\t"
1519             "# neg double by sign flipping" %}
1520   ins_encode %{
1521     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
1522   %}
1523   ins_pipe(pipe_slow);
1524 %}
1525 
1526 instruct negD_reg_reg(regD dst, regD src) %{
1527   predicate(UseAVX > 0);
1528   match(Set dst (NegD src));
1529   ins_cost(150);
1530   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
1531             "# neg double by sign flipping" %}
1532   ins_encode %{
1533     bool vector256 = false;
1534     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
1535               ExternalAddress(double_signflip()), vector256);
1536   %}
1537   ins_pipe(pipe_slow);
1538 %}
1539 
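     // Math.sqrt is defined on double, so a float sqrt appears in the ideal
     // graph as ConvD2F(SqrtD(ConvF2D x)).  Matching the whole pattern lets
     // it collapse into a single sqrtss.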
1540 instruct sqrtF_reg(regF dst, regF src) %{
1541   predicate(UseSSE>=1);
1542   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
1543 
1544   format %{ "sqrtss  $dst, $src" %}
1545   ins_cost(150);
1546   ins_encode %{
1547     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
1548   %}
1549   ins_pipe(pipe_slow);
1550 %}
1551 
1552 instruct sqrtF_mem(regF dst, memory src) %{
1553   predicate(UseSSE>=1);
1554   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
1555 
1556   format %{ "sqrtss  $dst, $src" %}
1557   ins_cost(150);
1558   ins_encode %{
1559     __ sqrtss($dst$$XMMRegister, $src$$Address);
1560   %}
1561   ins_pipe(pipe_slow);
1562 %}
1563 
1564 instruct sqrtF_imm(regF dst, immF con) %{
1565   predicate(UseSSE>=1);
1566   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
1567   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1568   ins_cost(150);
1569   ins_encode %{
1570     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
1571   %}
1572   ins_pipe(pipe_slow);
1573 %}
1574 
1575 instruct sqrtD_reg(regD dst, regD src) %{
1576   predicate(UseSSE>=2);
1577   match(Set dst (SqrtD src));
1578 
1579   format %{ "sqrtsd  $dst, $src" %}
1580   ins_cost(150);
1581   ins_encode %{
1582     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
1583   %}
1584   ins_pipe(pipe_slow);
1585 %}
1586 
1587 instruct sqrtD_mem(regD dst, memory src) %{
1588   predicate(UseSSE>=2);
1589   match(Set dst (SqrtD (LoadD src)));
1590 
1591   format %{ "sqrtsd  $dst, $src" %}
1592   ins_cost(150);
1593   ins_encode %{
1594     __ sqrtsd($dst$$XMMRegister, $src$$Address);
1595   %}
1596   ins_pipe(pipe_slow);
1597 %}
1598 
1599 instruct sqrtD_imm(regD dst, immD con) %{
1600   predicate(UseSSE>=2);
1601   match(Set dst (SqrtD con));
1602   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1603   ins_cost(150);
1604   ins_encode %{
1605     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
1606   %}
1607   ins_pipe(pipe_slow);
1608 %}
1609 
1610 
1611 // ====================VECTOR INSTRUCTIONS=====================================
1612 
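     // The vector operand classes used below encode the vector width in
     // bytes: vecS = 4, vecD = 8, vecX = 16 (a full XMM register) and
     // vecY = 32 (a full YMM register).
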
1613 // Load vectors (4 bytes long)
1614 instruct loadV4(vecS dst, memory mem) %{
1615   predicate(n->as_LoadVector()->memory_size() == 4);
1616   match(Set dst (LoadVector mem));
1617   ins_cost(125);
1618   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
1619   ins_encode %{
1620     __ movdl($dst$$XMMRegister, $mem$$Address);
1621   %}
1622   ins_pipe( pipe_slow );
1623 %}
1624 
1625 // Load vectors (8 bytes long)
1626 instruct loadV8(vecD dst, memory mem) %{
1627   predicate(n->as_LoadVector()->memory_size() == 8);
1628   match(Set dst (LoadVector mem));
1629   ins_cost(125);
1630   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
1631   ins_encode %{
1632     __ movq($dst$$XMMRegister, $mem$$Address);
1633   %}
1634   ins_pipe( pipe_slow );
1635 %}
1636 
1637 // Load vectors (16 bytes long)
1638 instruct loadV16(vecX dst, memory mem) %{
1639   predicate(n->as_LoadVector()->memory_size() == 16);
1640   match(Set dst (LoadVector mem));
1641   ins_cost(125);
1642   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
1643   ins_encode %{
1644     __ movdqu($dst$$XMMRegister, $mem$$Address);
1645   %}
1646   ins_pipe( pipe_slow );
1647 %}
1648 
1649 // Load vectors (32 bytes long)
1650 instruct loadV32(vecY dst, memory mem) %{
1651   predicate(n->as_LoadVector()->memory_size() == 32);
1652   match(Set dst (LoadVector mem));
1653   ins_cost(125);
1654   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
1655   ins_encode %{
1656     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
1657   %}
1658   ins_pipe( pipe_slow );
1659 %}
1660 
1661 // Store vectors
1662 instruct storeV4(memory mem, vecS src) %{
1663   predicate(n->as_StoreVector()->memory_size() == 4);
1664   match(Set mem (StoreVector mem src));
1665   ins_cost(145);
1666   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
1667   ins_encode %{
1668     __ movdl($mem$$Address, $src$$XMMRegister);
1669   %}
1670   ins_pipe( pipe_slow );
1671 %}
1672 
1673 instruct storeV8(memory mem, vecD src) %{
1674   predicate(n->as_StoreVector()->memory_size() == 8);
1675   match(Set mem (StoreVector mem src));
1676   ins_cost(145);
1677   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
1678   ins_encode %{
1679     __ movq($mem$$Address, $src$$XMMRegister);
1680   %}
1681   ins_pipe( pipe_slow );
1682 %}
1683 
1684 instruct storeV16(memory mem, vecX src) %{
1685   predicate(n->as_StoreVector()->memory_size() == 16);
1686   match(Set mem (StoreVector mem src));
1687   ins_cost(145);
1688   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
1689   ins_encode %{
1690     __ movdqu($mem$$Address, $src$$XMMRegister);
1691   %}
1692   ins_pipe( pipe_slow );
1693 %}
1694 
1695 instruct storeV32(memory mem, vecY src) %{
1696   predicate(n->as_StoreVector()->memory_size() == 32);
1697   match(Set mem (StoreVector mem src));
1698   ins_cost(145);
1699   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
1700   ins_encode %{
1701     __ vmovdqu($mem$$Address, $src$$XMMRegister);
1702   %}
1703   ins_pipe( pipe_slow );
1704 %}
1705 
1706 // Replicate byte scalar to be vector
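     // Broadcast sequence: movd moves the low 32 bits of the GP register
     // into the XMM register, punpcklbw doubles each byte, and pshuflw 0x00
     // broadcasts the low word across the low 64 bits.  For 16 bytes
     // punpcklqdq copies the low quadword into the high one, and for 32
     // bytes vinserti128h copies the low 128-bit lane into the upper lane
     // of the YMM register.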
1707 instruct Repl4B(vecS dst, rRegI src) %{
1708   predicate(n->as_Vector()->length() == 4);
1709   match(Set dst (ReplicateB src));
1710   format %{ "movd    $dst,$src\n\t"
1711             "punpcklbw $dst,$dst\n\t"
1712             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
1713   ins_encode %{
1714     __ movdl($dst$$XMMRegister, $src$$Register);
1715     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1716     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1717   %}
1718   ins_pipe( pipe_slow );
1719 %}
1720 
1721 instruct Repl8B(vecD dst, rRegI src) %{
1722   predicate(n->as_Vector()->length() == 8);
1723   match(Set dst (ReplicateB src));
1724   format %{ "movd    $dst,$src\n\t"
1725             "punpcklbw $dst,$dst\n\t"
1726             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
1727   ins_encode %{
1728     __ movdl($dst$$XMMRegister, $src$$Register);
1729     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1730     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1731   %}
1732   ins_pipe( pipe_slow );
1733 %}
1734 
1735 instruct Repl16B(vecX dst, rRegI src) %{
1736   predicate(n->as_Vector()->length() == 16);
1737   match(Set dst (ReplicateB src));
1738   format %{ "movd    $dst,$src\n\t"
1739             "punpcklbw $dst,$dst\n\t"
1740             "pshuflw $dst,$dst,0x00\n\t"
1741             "punpcklqdq $dst,$dst\t! replicate16B" %}
1742   ins_encode %{
1743     __ movdl($dst$$XMMRegister, $src$$Register);
1744     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1745     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1746     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1747   %}
1748   ins_pipe( pipe_slow );
1749 %}
1750 
1751 instruct Repl32B(vecY dst, rRegI src) %{
1752   predicate(n->as_Vector()->length() == 32);
1753   match(Set dst (ReplicateB src));
1754   format %{ "movd    $dst,$src\n\t"
1755             "punpcklbw $dst,$dst\n\t"
1756             "pshuflw $dst,$dst,0x00\n\t"
1757             "punpcklqdq $dst,$dst\n\t"
1758             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
1759   ins_encode %{
1760     __ movdl($dst$$XMMRegister, $src$$Register);
1761     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1762     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1763     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1764     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1765   %}
1766   ins_pipe( pipe_slow );
1767 %}
1768 
1769 // Replicate byte scalar immediate to be vector by loading from const table.
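     // replicate4_imm/replicate8_imm build a 4- or 8-byte constant with the
     // immediate repeated in every element, so the replicate reduces to a
     // constant-table load (widened with punpcklqdq / vinserti128h for the
     // larger vectors).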
1770 instruct Repl4B_imm(vecS dst, immI con) %{
1771   predicate(n->as_Vector()->length() == 4);
1772   match(Set dst (ReplicateB con));
1773   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
1774   ins_encode %{
1775     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
1776   %}
1777   ins_pipe( pipe_slow );
1778 %}
1779 
1780 instruct Repl8B_imm(vecD dst, immI con) %{
1781   predicate(n->as_Vector()->length() == 8);
1782   match(Set dst (ReplicateB con));
1783   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
1784   ins_encode %{
1785     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1786   %}
1787   ins_pipe( pipe_slow );
1788 %}
1789 
1790 instruct Repl16B_imm(vecX dst, immI con) %{
1791   predicate(n->as_Vector()->length() == 16);
1792   match(Set dst (ReplicateB con));
1793   format %{ "movq    $dst,[$constantaddress]\n\t"
1794             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
1795   ins_encode %{
1796     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1797     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1798   %}
1799   ins_pipe( pipe_slow );
1800 %}
1801 
1802 instruct Repl32B_imm(vecY dst, immI con) %{
1803   predicate(n->as_Vector()->length() == 32);
1804   match(Set dst (ReplicateB con));
1805   format %{ "movq    $dst,[$constantaddress]\n\t"
1806             "punpcklqdq $dst,$dst\n\t"
1807             "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
1808   ins_encode %{
1809     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1810     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1811     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1812   %}
1813   ins_pipe( pipe_slow );
1814 %}
1815 
1816 // Replicate byte scalar zero to be vector
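     // pxor of a register with itself is the usual idiom for zeroing an XMM
     // register; no constant load is needed.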
1817 instruct Repl4B_zero(vecS dst, immI0 zero) %{
1818   predicate(n->as_Vector()->length() == 4);
1819   match(Set dst (ReplicateB zero));
1820   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
1821   ins_encode %{
1822     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1823   %}
1824   ins_pipe( fpu_reg_reg );
1825 %}
1826 
1827 instruct Repl8B_zero(vecD dst, immI0 zero) %{
1828   predicate(n->as_Vector()->length() == 8);
1829   match(Set dst (ReplicateB zero));
1830   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
1831   ins_encode %{
1832     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1833   %}
1834   ins_pipe( fpu_reg_reg );
1835 %}
1836 
1837 instruct Repl16B_zero(vecX dst, immI0 zero) %{
1838   predicate(n->as_Vector()->length() == 16);
1839   match(Set dst (ReplicateB zero));
1840   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
1841   ins_encode %{
1842     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1843   %}
1844   ins_pipe( fpu_reg_reg );
1845 %}
1846 
1847 instruct Repl32B_zero(vecY dst, immI0 zero) %{
1848   predicate(n->as_Vector()->length() == 32);
1849   match(Set dst (ReplicateB zero));
1850   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
1851   ins_encode %{
1852     // vpxor for 256-bit vectors requires AVX2; on plain AVX the macro assembler emits vxorpd instead.
1853     bool vector256 = true;
1854     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
1855   %}
1856   ins_pipe( fpu_reg_reg );
1857 %}
1858 
1859 // Replicate char/short (2 byte) scalar to be vector
1860 instruct Repl2S(vecS dst, rRegI src) %{
1861   predicate(n->as_Vector()->length() == 2);
1862   match(Set dst (ReplicateS src));
1863   format %{ "movd    $dst,$src\n\t"
1864             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
1865   ins_encode %{
1866     __ movdl($dst$$XMMRegister, $src$$Register);
1867     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1868   %}
1869   ins_pipe( fpu_reg_reg );
1870 %}
1871 
1872 instruct Repl4S(vecD dst, rRegI src) %{
1873   predicate(n->as_Vector()->length() == 4);
1874   match(Set dst (ReplicateS src));
1875   format %{ "movd    $dst,$src\n\t"
1876             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
1877   ins_encode %{
1878     __ movdl($dst$$XMMRegister, $src$$Register);
1879     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1880   %}
1881   ins_pipe( fpu_reg_reg );
1882 %}
1883 
1884 instruct Repl8S(vecX dst, rRegI src) %{
1885   predicate(n->as_Vector()->length() == 8);
1886   match(Set dst (ReplicateS src));
1887   format %{ "movd    $dst,$src\n\t"
1888             "pshuflw $dst,$dst,0x00\n\t"
1889             "punpcklqdq $dst,$dst\t! replicate8S" %}
1890   ins_encode %{
1891     __ movdl($dst$$XMMRegister, $src$$Register);
1892     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1893     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1894   %}
1895   ins_pipe( pipe_slow );
1896 %}
1897 
1898 instruct Repl16S(vecY dst, rRegI src) %{
1899   predicate(n->as_Vector()->length() == 16);
1900   match(Set dst (ReplicateS src));
1901   format %{ "movd    $dst,$src\n\t"
1902             "pshuflw $dst,$dst,0x00\n\t"
1903             "punpcklqdq $dst,$dst\n\t"
1904             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
1905   ins_encode %{
1906     __ movdl($dst$$XMMRegister, $src$$Register);
1907     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1908     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1909     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1910   %}
1911   ins_pipe( pipe_slow );
1912 %}
1913 
1914 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
1915 instruct Repl2S_imm(vecS dst, immI con) %{
1916   predicate(n->as_Vector()->length() == 2);
1917   match(Set dst (ReplicateS con));
1918   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
1919   ins_encode %{
1920     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
1921   %}
1922   ins_pipe( fpu_reg_reg );
1923 %}
1924 
1925 instruct Repl4S_imm(vecD dst, immI con) %{
1926   predicate(n->as_Vector()->length() == 4);
1927   match(Set dst (ReplicateS con));
1928   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
1929   ins_encode %{
1930     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
1931   %}
1932   ins_pipe( fpu_reg_reg );
1933 %}
1934 
1935 instruct Repl8S_imm(vecX dst, immI con) %{
1936   predicate(n->as_Vector()->length() == 8);
1937   match(Set dst (ReplicateS con));
1938   format %{ "movq    $dst,[$constantaddress]\n\t"
1939             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
1940   ins_encode %{
1941     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
1942     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1943   %}
1944   ins_pipe( pipe_slow );
1945 %}
1946 
1947 instruct Repl16S_imm(vecY dst, immI con) %{
1948   predicate(n->as_Vector()->length() == 16);
1949   match(Set dst (ReplicateS con));
1950   format %{ "movq    $dst,[$constantaddress]\n\t"
1951             "punpcklqdq $dst,$dst\n\t"
1952             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
1953   ins_encode %{
1954     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
1955     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1956     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1957   %}
1958   ins_pipe( pipe_slow );
1959 %}
1960 
1961 // Replicate char/short (2 byte) scalar zero to be vector
1962 instruct Repl2S_zero(vecS dst, immI0 zero) %{
1963   predicate(n->as_Vector()->length() == 2);
1964   match(Set dst (ReplicateS zero));
1965   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
1966   ins_encode %{
1967     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1968   %}
1969   ins_pipe( fpu_reg_reg );
1970 %}
1971 
1972 instruct Repl4S_zero(vecD dst, immI0 zero) %{
1973   predicate(n->as_Vector()->length() == 4);
1974   match(Set dst (ReplicateS zero));
1975   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
1976   ins_encode %{
1977     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1978   %}
1979   ins_pipe( fpu_reg_reg );
1980 %}
1981 
1982 instruct Repl8S_zero(vecX dst, immI0 zero) %{
1983   predicate(n->as_Vector()->length() == 8);
1984   match(Set dst (ReplicateS zero));
1985   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
1986   ins_encode %{
1987     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1988   %}
1989   ins_pipe( fpu_reg_reg );
1990 %}
1991 
1992 instruct Repl16S_zero(vecY dst, immI0 zero) %{
1993   predicate(n->as_Vector()->length() == 16);
1994   match(Set dst (ReplicateS zero));
1995   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
1996   ins_encode %{
1997     // vpxor for 256-bit vectors requires AVX2; on plain AVX the macro assembler emits vxorpd instead.
1998     bool vector256 = true;
1999     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2000   %}
2001   ins_pipe( fpu_reg_reg );
2002 %}
2003 
2004 // Replicate integer (4 byte) scalar to be vector
2005 instruct Repl2I(vecD dst, rRegI src) %{
2006   predicate(n->as_Vector()->length() == 2);
2007   match(Set dst (ReplicateI src));
2008   format %{ "movd    $dst,$src\n\t"
2009             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2010   ins_encode %{
2011     __ movdl($dst$$XMMRegister, $src$$Register);
2012     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2013   %}
2014   ins_pipe( fpu_reg_reg );
2015 %}
2016 
2017 instruct Repl4I(vecX dst, rRegI src) %{
2018   predicate(n->as_Vector()->length() == 4);
2019   match(Set dst (ReplicateI src));
2020   format %{ "movd    $dst,$src\n\t"
2021             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2022   ins_encode %{
2023     __ movdl($dst$$XMMRegister, $src$$Register);
2024     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2025   %}
2026   ins_pipe( pipe_slow );
2027 %}
2028 
2029 instruct Repl8I(vecY dst, rRegI src) %{
2030   predicate(n->as_Vector()->length() == 8);
2031   match(Set dst (ReplicateI src));
2032   format %{ "movd    $dst,$src\n\t"
2033             "pshufd  $dst,$dst,0x00\n\t"
2034             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2035   ins_encode %{
2036     __ movdl($dst$$XMMRegister, $src$$Register);
2037     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2038     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2039   %}
2040   ins_pipe( pipe_slow );
2041 %}
2042 
2043 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
2044 instruct Repl2I_imm(vecD dst, immI con) %{
2045   predicate(n->as_Vector()->length() == 2);
2046   match(Set dst (ReplicateI con));
2047   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
2048   ins_encode %{
2049     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2050   %}
2051   ins_pipe( fpu_reg_reg );
2052 %}
2053 
2054 instruct Repl4I_imm(vecX dst, immI con) %{
2055   predicate(n->as_Vector()->length() == 4);
2056   match(Set dst (ReplicateI con));
2057   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
2058             "punpcklqdq $dst,$dst" %}
2059   ins_encode %{
2060     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2061     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2062   %}
2063   ins_pipe( pipe_slow );
2064 %}
2065 
2066 instruct Repl8I_imm(vecY dst, immI con) %{
2067   predicate(n->as_Vector()->length() == 8);
2068   match(Set dst (ReplicateI con));
2069   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
2070             "punpcklqdq $dst,$dst\n\t"
2071             "vinserti128h $dst,$dst,$dst" %}
2072   ins_encode %{
2073     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2074     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2075     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2076   %}
2077   ins_pipe( pipe_slow );
2078 %}
2079 
2080 // An integer can be loaded into an XMM register directly from memory.
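     // movd accepts a memory operand, so the value is broadcast without
     // first passing through a general-purpose register.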
2081 instruct Repl2I_mem(vecD dst, memory mem) %{
2082   predicate(n->as_Vector()->length() == 2);
2083   match(Set dst (ReplicateI (LoadI mem)));
2084   format %{ "movd    $dst,$mem\n\t"
2085             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2086   ins_encode %{
2087     __ movdl($dst$$XMMRegister, $mem$$Address);
2088     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2089   %}
2090   ins_pipe( fpu_reg_reg );
2091 %}
2092 
2093 instruct Repl4I_mem(vecX dst, memory mem) %{
2094   predicate(n->as_Vector()->length() == 4);
2095   match(Set dst (ReplicateI (LoadI mem)));
2096   format %{ "movd    $dst,$mem\n\t"
2097             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2098   ins_encode %{
2099     __ movdl($dst$$XMMRegister, $mem$$Address);
2100     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2101   %}
2102   ins_pipe( pipe_slow );
2103 %}
2104 
2105 instruct Repl8I_mem(vecY dst, memory mem) %{
2106   predicate(n->as_Vector()->length() == 8);
2107   match(Set dst (ReplicateI (LoadI mem)));
2108   format %{ "movd    $dst,$mem\n\t"
2109             "pshufd  $dst,$dst,0x00\n\t"
2110             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2111   ins_encode %{
2112     __ movdl($dst$$XMMRegister, $mem$$Address);
2113     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2114     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2115   %}
2116   ins_pipe( pipe_slow );
2117 %}
2118 
2119 // Replicate integer (4 byte) scalar zero to be vector
2120 instruct Repl2I_zero(vecD dst, immI0 zero) %{
2121   predicate(n->as_Vector()->length() == 2);
2122   match(Set dst (ReplicateI zero));
2123   format %{ "pxor    $dst,$dst\t! replicate2I" %}
2124   ins_encode %{
2125     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2126   %}
2127   ins_pipe( fpu_reg_reg );
2128 %}
2129 
2130 instruct Repl4I_zero(vecX dst, immI0 zero) %{
2131   predicate(n->as_Vector()->length() == 4);
2132   match(Set dst (ReplicateI zero));
2133   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
2134   ins_encode %{
2135     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2136   %}
2137   ins_pipe( fpu_reg_reg );
2138 %}
2139 
2140 instruct Repl8I_zero(vecY dst, immI0 zero) %{
2141   predicate(n->as_Vector()->length() == 8);
2142   match(Set dst (ReplicateI zero));
2143   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
2144   ins_encode %{
2145     // vpxor for 256-bit vectors requires AVX2; on plain AVX the macro assembler emits vxorpd instead.
2146     bool vector256 = true;
2147     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2148   %}
2149   ins_pipe( fpu_reg_reg );
2150 %}
2151 
2152 // Replicate long (8 byte) scalar to be vector
2153 #ifdef _LP64
2154 instruct Repl2L(vecX dst, rRegL src) %{
2155   predicate(n->as_Vector()->length() == 2);
2156   match(Set dst (ReplicateL src));
2157   format %{ "movdq   $dst,$src\n\t"
2158             "punpcklqdq $dst,$dst\t! replicate2L" %}
2159   ins_encode %{
2160     __ movdq($dst$$XMMRegister, $src$$Register);
2161     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2162   %}
2163   ins_pipe( pipe_slow );
2164 %}
2165 
2166 instruct Repl4L(vecY dst, rRegL src) %{
2167   predicate(n->as_Vector()->length() == 4);
2168   match(Set dst (ReplicateL src));
2169   format %{ "movdq   $dst,$src\n\t"
2170             "punpcklqdq $dst,$dst\n\t"
2171             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2172   ins_encode %{
2173     __ movdq($dst$$XMMRegister, $src$$Register);
2174     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2175     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2176   %}
2177   ins_pipe( pipe_slow );
2178 %}
2179 #else // _LP64
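     // On 32-bit a long is held in a pair of GP registers: the low and high
     // halves are moved into two XMM registers (HIGH_FROM_LOW selects the
     // register holding the high half), merged with punpckldq, and the
     // resulting 64-bit lane is then broadcast as above.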
2180 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
2181   predicate(n->as_Vector()->length() == 2);
2182   match(Set dst (ReplicateL src));
2183   effect(TEMP dst, USE src, TEMP tmp);
2184   format %{ "movdl   $dst,$src.lo\n\t"
2185             "movdl   $tmp,$src.hi\n\t"
2186             "punpckldq $dst,$tmp\n\t"
2187             "punpcklqdq $dst,$dst\t! replicate2L"%}
2188   ins_encode %{
2189     __ movdl($dst$$XMMRegister, $src$$Register);
2190     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2191     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2192     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2193   %}
2194   ins_pipe( pipe_slow );
2195 %}
2196 
2197 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
2198   predicate(n->as_Vector()->length() == 4);
2199   match(Set dst (ReplicateL src));
2200   effect(TEMP dst, USE src, TEMP tmp);
2201   format %{ "movdl   $dst,$src.lo\n\t"
2202             "movdl   $tmp,$src.hi\n\t"
2203             "punpckldq $dst,$tmp\n\t"
2204             "punpcklqdq $dst,$dst\n\t"
2205             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2206   ins_encode %{
2207     __ movdl($dst$$XMMRegister, $src$$Register);
2208     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2209     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2210     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2211     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2212   %}
2213   ins_pipe( pipe_slow );
2214 %}
2215 #endif // _LP64
2216 
2217 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
2218 instruct Repl2L_imm(vecX dst, immL con) %{
2219   predicate(n->as_Vector()->length() == 2);
2220   match(Set dst (ReplicateL con));
2221   format %{ "movq    $dst,[$constantaddress]\n\t"
2222             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
2223   ins_encode %{
2224     __ movq($dst$$XMMRegister, $constantaddress($con));
2225     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2226   %}
2227   ins_pipe( pipe_slow );
2228 %}
2229 
2230 instruct Repl4L_imm(vecY dst, immL con) %{
2231   predicate(n->as_Vector()->length() == 4);
2232   match(Set dst (ReplicateL con));
2233   format %{ "movq    $dst,[$constantaddress]\n\t"
2234             "punpcklqdq $dst,$dst\n\t"
2235             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
2236   ins_encode %{
2237     __ movq($dst$$XMMRegister, $constantaddress($con));
2238     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2239     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2240   %}
2241   ins_pipe( pipe_slow );
2242 %}
2243 
2244 // A long can be loaded into an XMM register directly from memory.
2245 instruct Repl2L_mem(vecX dst, memory mem) %{
2246   predicate(n->as_Vector()->length() == 2);
2247   match(Set dst (ReplicateL (LoadL mem)));
2248   format %{ "movq    $dst,$mem\n\t"
2249             "punpcklqdq $dst,$dst\t! replicate2L" %}
2250   ins_encode %{
2251     __ movq($dst$$XMMRegister, $mem$$Address);
2252     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2253   %}
2254   ins_pipe( pipe_slow );
2255 %}
2256 
2257 instruct Repl4L_mem(vecY dst, memory mem) %{
2258   predicate(n->as_Vector()->length() == 4);
2259   match(Set dst (ReplicateL (LoadL mem)));
2260   format %{ "movq    $dst,$mem\n\t"
2261             "punpcklqdq $dst,$dst\n\t"
2262             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2263   ins_encode %{
2264     __ movq($dst$$XMMRegister, $mem$$Address);
2265     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2266     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2267   %}
2268   ins_pipe( pipe_slow );
2269 %}
2270 
2271 // Replicate long (8 byte) scalar zero to be vector
2272 instruct Repl2L_zero(vecX dst, immL0 zero) %{
2273   predicate(n->as_Vector()->length() == 2);
2274   match(Set dst (ReplicateL zero));
2275   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
2276   ins_encode %{
2277     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2278   %}
2279   ins_pipe( fpu_reg_reg );
2280 %}
2281 
2282 instruct Repl4L_zero(vecY dst, immL0 zero) %{
2283   predicate(n->as_Vector()->length() == 4);
2284   match(Set dst (ReplicateL zero));
2285   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
2286   ins_encode %{
2287     // vpxor for 256-bit vectors requires AVX2; on plain AVX the macro assembler emits vxorpd instead.
2288     bool vector256 = true;
2289     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2290   %}
2291   ins_pipe( fpu_reg_reg );
2292 %}
2293 
2294 // Replicate float (4 byte) scalar to be vector
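     // A float scalar is already in an XMM register, so a single pshufd with
     // shuffle code 0x00 broadcasts element 0 into all four 32-bit lanes.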
2295 instruct Repl2F(vecD dst, regF src) %{
2296   predicate(n->as_Vector()->length() == 2);
2297   match(Set dst (ReplicateF src));
2298   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
2299   ins_encode %{
2300     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2301   %}
2302   ins_pipe( fpu_reg_reg );
2303 %}
2304 
2305 instruct Repl4F(vecX dst, regF src) %{
2306   predicate(n->as_Vector()->length() == 4);
2307   match(Set dst (ReplicateF src));
2308   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
2309   ins_encode %{
2310     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2311   %}
2312   ins_pipe( pipe_slow );
2313 %}
2314 
2315 instruct Repl8F(vecY dst, regF src) %{
2316   predicate(n->as_Vector()->length() == 8);
2317   match(Set dst (ReplicateF src));
2318   format %{ "pshufd  $dst,$src,0x00\n\t"
2319             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
2320   ins_encode %{
2321     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2322     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2323   %}
2324   ins_pipe( pipe_slow );
2325 %}
2326 
2327 // Replicate float (4 byte) scalar zero to be vector
2328 instruct Repl2F_zero(vecD dst, immF0 zero) %{
2329   predicate(n->as_Vector()->length() == 2);
2330   match(Set dst (ReplicateF zero));
2331   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
2332   ins_encode %{
2333     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2334   %}
2335   ins_pipe( fpu_reg_reg );
2336 %}
2337 
2338 instruct Repl4F_zero(vecX dst, immF0 zero) %{
2339   predicate(n->as_Vector()->length() == 4);
2340   match(Set dst (ReplicateF zero));
2341   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
2342   ins_encode %{
2343     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2344   %}
2345   ins_pipe( fpu_reg_reg );
2346 %}
2347 
2348 instruct Repl8F_zero(vecY dst, immF0 zero) %{
2349   predicate(n->as_Vector()->length() == 8);
2350   match(Set dst (ReplicateF zero));
2351   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
2352   ins_encode %{
2353     bool vector256 = true;
2354     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2355   %}
2356   ins_pipe( fpu_reg_reg );
2357 %}
2358 
2359 // Replicate double (8 bytes) scalar to be vector
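     // Shuffle code 0x44 (binary 01 00 01 00) selects dwords 0,1,0,1, which
     // duplicates the low 64-bit double into both halves of the register.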
2360 instruct Repl2D(vecX dst, regD src) %{
2361   predicate(n->as_Vector()->length() == 2);
2362   match(Set dst (ReplicateD src));
2363   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
2364   ins_encode %{
2365     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2366   %}
2367   ins_pipe( pipe_slow );
2368 %}
2369 
2370 instruct Repl4D(vecY dst, regD src) %{
2371   predicate(n->as_Vector()->length() == 4);
2372   match(Set dst (ReplicateD src));
2373   format %{ "pshufd  $dst,$src,0x44\n\t"
2374             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
2375   ins_encode %{
2376     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2377     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2378   %}
2379   ins_pipe( pipe_slow );
2380 %}
2381 
2382 // Replicate double (8 byte) scalar zero to be vector
2383 instruct Repl2D_zero(vecX dst, immD0 zero) %{
2384   predicate(n->as_Vector()->length() == 2);
2385   match(Set dst (ReplicateD zero));
2386   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
2387   ins_encode %{
2388     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
2389   %}
2390   ins_pipe( fpu_reg_reg );
2391 %}
2392 
2393 instruct Repl4D_zero(vecY dst, immD0 zero) %{
2394   predicate(n->as_Vector()->length() == 4);
2395   match(Set dst (ReplicateD zero));
2396   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
2397   ins_encode %{
2398     bool vector256 = true;
2399     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2400   %}
2401   ins_pipe( fpu_reg_reg );
2402 %}
2403 
2404 // ====================VECTOR ARITHMETIC=======================================
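     // The two-operand SSE rules below are destructive (dst = dst op src)
     // and match e.g. (Set dst (AddVB dst src)); the AVX rules use the
     // non-destructive three-operand forms and can also take a memory
     // operand for the second source.  The vector256 flag selects the
     // 128-bit or 256-bit VEX encoding: 256-bit integer operations require
     // AVX2 (UseAVX > 1), while 256-bit floating-point operations only need
     // AVX (UseAVX > 0).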
2405 
2406 // --------------------------------- ADD --------------------------------------
2407 
2408 // Bytes vector add
2409 instruct vadd4B(vecS dst, vecS src) %{
2410   predicate(n->as_Vector()->length() == 4);
2411   match(Set dst (AddVB dst src));
2412   format %{ "paddb   $dst,$src\t! add packed4B" %}
2413   ins_encode %{
2414     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2415   %}
2416   ins_pipe( pipe_slow );
2417 %}
2418 
2419 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
2420   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2421   match(Set dst (AddVB src1 src2));
2422   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
2423   ins_encode %{
2424     bool vector256 = false;
2425     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2426   %}
2427   ins_pipe( pipe_slow );
2428 %}
2429 
2430 instruct vadd8B(vecD dst, vecD src) %{
2431   predicate(n->as_Vector()->length() == 8);
2432   match(Set dst (AddVB dst src));
2433   format %{ "paddb   $dst,$src\t! add packed8B" %}
2434   ins_encode %{
2435     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2436   %}
2437   ins_pipe( pipe_slow );
2438 %}
2439 
2440 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
2441   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2442   match(Set dst (AddVB src1 src2));
2443   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
2444   ins_encode %{
2445     bool vector256 = false;
2446     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2447   %}
2448   ins_pipe( pipe_slow );
2449 %}
2450 
2451 instruct vadd16B(vecX dst, vecX src) %{
2452   predicate(n->as_Vector()->length() == 16);
2453   match(Set dst (AddVB dst src));
2454   format %{ "paddb   $dst,$src\t! add packed16B" %}
2455   ins_encode %{
2456     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2457   %}
2458   ins_pipe( pipe_slow );
2459 %}
2460 
2461 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
2462   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2463   match(Set dst (AddVB src1 src2));
2464   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
2465   ins_encode %{
2466     bool vector256 = false;
2467     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2468   %}
2469   ins_pipe( pipe_slow );
2470 %}
2471 
2472 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
2473   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2474   match(Set dst (AddVB src (LoadVector mem)));
2475   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
2476   ins_encode %{
2477     bool vector256 = false;
2478     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2479   %}
2480   ins_pipe( pipe_slow );
2481 %}
2482 
2483 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
2484   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2485   match(Set dst (AddVB src1 src2));
2486   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
2487   ins_encode %{
2488     bool vector256 = true;
2489     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2490   %}
2491   ins_pipe( pipe_slow );
2492 %}
2493 
2494 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
2495   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2496   match(Set dst (AddVB src (LoadVector mem)));
2497   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
2498   ins_encode %{
2499     bool vector256 = true;
2500     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2501   %}
2502   ins_pipe( pipe_slow );
2503 %}
2504 
2505 // Shorts/Chars vector add
2506 instruct vadd2S(vecS dst, vecS src) %{
2507   predicate(n->as_Vector()->length() == 2);
2508   match(Set dst (AddVS dst src));
2509   format %{ "paddw   $dst,$src\t! add packed2S" %}
2510   ins_encode %{
2511     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2512   %}
2513   ins_pipe( pipe_slow );
2514 %}
2515 
2516 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
2517   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2518   match(Set dst (AddVS src1 src2));
2519   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
2520   ins_encode %{
2521     bool vector256 = false;
2522     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2523   %}
2524   ins_pipe( pipe_slow );
2525 %}
2526 
2527 instruct vadd4S(vecD dst, vecD src) %{
2528   predicate(n->as_Vector()->length() == 4);
2529   match(Set dst (AddVS dst src));
2530   format %{ "paddw   $dst,$src\t! add packed4S" %}
2531   ins_encode %{
2532     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2533   %}
2534   ins_pipe( pipe_slow );
2535 %}
2536 
2537 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
2538   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2539   match(Set dst (AddVS src1 src2));
2540   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
2541   ins_encode %{
2542     bool vector256 = false;
2543     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2544   %}
2545   ins_pipe( pipe_slow );
2546 %}
2547 
2548 instruct vadd8S(vecX dst, vecX src) %{
2549   predicate(n->as_Vector()->length() == 8);
2550   match(Set dst (AddVS dst src));
2551   format %{ "paddw   $dst,$src\t! add packed8S" %}
2552   ins_encode %{
2553     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2554   %}
2555   ins_pipe( pipe_slow );
2556 %}
2557 
2558 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
2559   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2560   match(Set dst (AddVS src1 src2));
2561   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
2562   ins_encode %{
2563     bool vector256 = false;
2564     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2565   %}
2566   ins_pipe( pipe_slow );
2567 %}
2568 
2569 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
2570   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2571   match(Set dst (AddVS src (LoadVector mem)));
2572   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
2573   ins_encode %{
2574     bool vector256 = false;
2575     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2576   %}
2577   ins_pipe( pipe_slow );
2578 %}
2579 
2580 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
2581   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
2582   match(Set dst (AddVS src1 src2));
2583   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
2584   ins_encode %{
2585     bool vector256 = true;
2586     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2587   %}
2588   ins_pipe( pipe_slow );
2589 %}
2590 
2591 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
2592   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
2593   match(Set dst (AddVS src (LoadVector mem)));
2594   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
2595   ins_encode %{
2596     bool vector256 = true;
2597     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2598   %}
2599   ins_pipe( pipe_slow );
2600 %}
2601 
2602 // Integers vector add
2603 instruct vadd2I(vecD dst, vecD src) %{
2604   predicate(n->as_Vector()->length() == 2);
2605   match(Set dst (AddVI dst src));
2606   format %{ "paddd   $dst,$src\t! add packed2I" %}
2607   ins_encode %{
2608     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
2609   %}
2610   ins_pipe( pipe_slow );
2611 %}
2612 
2613 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
2614   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2615   match(Set dst (AddVI src1 src2));
2616   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
2617   ins_encode %{
2618     bool vector256 = false;
2619     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2620   %}
2621   ins_pipe( pipe_slow );
2622 %}
2623 
2624 instruct vadd4I(vecX dst, vecX src) %{
2625   predicate(n->as_Vector()->length() == 4);
2626   match(Set dst (AddVI dst src));
2627   format %{ "paddd   $dst,$src\t! add packed4I" %}
2628   ins_encode %{
2629     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
2630   %}
2631   ins_pipe( pipe_slow );
2632 %}
2633 
2634 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
2635   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2636   match(Set dst (AddVI src1 src2));
2637   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
2638   ins_encode %{
2639     bool vector256 = false;
2640     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2641   %}
2642   ins_pipe( pipe_slow );
2643 %}
2644 
2645 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
2646   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2647   match(Set dst (AddVI src (LoadVector mem)));
2648   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
2649   ins_encode %{
2650     bool vector256 = false;
2651     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2652   %}
2653   ins_pipe( pipe_slow );
2654 %}
2655 
2656 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
2657   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
2658   match(Set dst (AddVI src1 src2));
2659   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
2660   ins_encode %{
2661     bool vector256 = true;
2662     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2663   %}
2664   ins_pipe( pipe_slow );
2665 %}
2666 
2667 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
2668   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
2669   match(Set dst (AddVI src (LoadVector mem)));
2670   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
2671   ins_encode %{
2672     bool vector256 = true;
2673     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2674   %}
2675   ins_pipe( pipe_slow );
2676 %}
2677 
2678 // Longs vector add
2679 instruct vadd2L(vecX dst, vecX src) %{
2680   predicate(n->as_Vector()->length() == 2);
2681   match(Set dst (AddVL dst src));
2682   format %{ "paddq   $dst,$src\t! add packed2L" %}
2683   ins_encode %{
2684     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
2685   %}
2686   ins_pipe( pipe_slow );
2687 %}
2688 
2689 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
2690   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2691   match(Set dst (AddVL src1 src2));
2692   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
2693   ins_encode %{
2694     bool vector256 = false;
2695     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2696   %}
2697   ins_pipe( pipe_slow );
2698 %}
2699 
2700 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
2701   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2702   match(Set dst (AddVL src (LoadVector mem)));
2703   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
2704   ins_encode %{
2705     bool vector256 = false;
2706     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2707   %}
2708   ins_pipe( pipe_slow );
2709 %}
2710 
2711 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
2712   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
2713   match(Set dst (AddVL src1 src2));
2714   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
2715   ins_encode %{
2716     bool vector256 = true;
2717     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2718   %}
2719   ins_pipe( pipe_slow );
2720 %}
2721 
2722 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
2723   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
2724   match(Set dst (AddVL src (LoadVector mem)));
2725   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
2726   ins_encode %{
2727     bool vector256 = true;
2728     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2729   %}
2730   ins_pipe( pipe_slow );
2731 %}
2732 
2733 // Floats vector add
2734 instruct vadd2F(vecD dst, vecD src) %{
2735   predicate(n->as_Vector()->length() == 2);
2736   match(Set dst (AddVF dst src));
2737   format %{ "addps   $dst,$src\t! add packed2F" %}
2738   ins_encode %{
2739     __ addps($dst$$XMMRegister, $src$$XMMRegister);
2740   %}
2741   ins_pipe( pipe_slow );
2742 %}
2743 
2744 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
2745   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2746   match(Set dst (AddVF src1 src2));
2747   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
2748   ins_encode %{
2749     bool vector256 = false;
2750     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2751   %}
2752   ins_pipe( pipe_slow );
2753 %}
2754 
2755 instruct vadd4F(vecX dst, vecX src) %{
2756   predicate(n->as_Vector()->length() == 4);
2757   match(Set dst (AddVF dst src));
2758   format %{ "addps   $dst,$src\t! add packed4F" %}
2759   ins_encode %{
2760     __ addps($dst$$XMMRegister, $src$$XMMRegister);
2761   %}
2762   ins_pipe( pipe_slow );
2763 %}
2764 
2765 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
2766   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2767   match(Set dst (AddVF src1 src2));
2768   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
2769   ins_encode %{
2770     bool vector256 = false;
2771     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2772   %}
2773   ins_pipe( pipe_slow );
2774 %}
2775 
2776 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
2777   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2778   match(Set dst (AddVF src (LoadVector mem)));
2779   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
2780   ins_encode %{
2781     bool vector256 = false;
2782     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2783   %}
2784   ins_pipe( pipe_slow );
2785 %}
2786 
2787 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
2788   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2789   match(Set dst (AddVF src1 src2));
2790   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
2791   ins_encode %{
2792     bool vector256 = true;
2793     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2794   %}
2795   ins_pipe( pipe_slow );
2796 %}
2797 
2798 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
2799   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2800   match(Set dst (AddVF src (LoadVector mem)));
2801   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
2802   ins_encode %{
2803     bool vector256 = true;
2804     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2805   %}
2806   ins_pipe( pipe_slow );
2807 %}
2808 
2809 // Doubles vector add
2810 instruct vadd2D(vecX dst, vecX src) %{
2811   predicate(n->as_Vector()->length() == 2);
2812   match(Set dst (AddVD dst src));
2813   format %{ "addpd   $dst,$src\t! add packed2D" %}
2814   ins_encode %{
2815     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
2816   %}
2817   ins_pipe( pipe_slow );
2818 %}
2819 
2820 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
2821   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2822   match(Set dst (AddVD src1 src2));
2823   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
2824   ins_encode %{
2825     bool vector256 = false;
2826     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2827   %}
2828   ins_pipe( pipe_slow );
2829 %}
2830 
2831 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
2832   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2833   match(Set dst (AddVD src (LoadVector mem)));
2834   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
2835   ins_encode %{
2836     bool vector256 = false;
2837     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2838   %}
2839   ins_pipe( pipe_slow );
2840 %}
2841 
2842 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
2843   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2844   match(Set dst (AddVD src1 src2));
2845   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
2846   ins_encode %{
2847     bool vector256 = true;
2848     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2849   %}
2850   ins_pipe( pipe_slow );
2851 %}
2852 
2853 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
2854   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2855   match(Set dst (AddVD src (LoadVector mem)));
2856   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
2857   ins_encode %{
2858     bool vector256 = true;
2859     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2860   %}
2861   ins_pipe( pipe_slow );
2862 %}
2863 
2864 // --------------------------------- SUB --------------------------------------
2865 
2866 // Bytes vector sub
2867 instruct vsub4B(vecS dst, vecS src) %{
2868   predicate(n->as_Vector()->length() == 4);
2869   match(Set dst (SubVB dst src));
2870   format %{ "psubb   $dst,$src\t! sub packed4B" %}
2871   ins_encode %{
2872     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
2873   %}
2874   ins_pipe( pipe_slow );
2875 %}
2876 
2877 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
2878   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2879   match(Set dst (SubVB src1 src2));
2880   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
2881   ins_encode %{
2882     bool vector256 = false;
2883     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2884   %}
2885   ins_pipe( pipe_slow );
2886 %}
2887 
2888 instruct vsub8B(vecD dst, vecD src) %{
2889   predicate(n->as_Vector()->length() == 8);
2890   match(Set dst (SubVB dst src));
2891   format %{ "psubb   $dst,$src\t! sub packed8B" %}
2892   ins_encode %{
2893     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
2894   %}
2895   ins_pipe( pipe_slow );
2896 %}
2897 
2898 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
2899   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2900   match(Set dst (SubVB src1 src2));
2901   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
2902   ins_encode %{
2903     bool vector256 = false;
2904     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2905   %}
2906   ins_pipe( pipe_slow );
2907 %}
2908 
2909 instruct vsub16B(vecX dst, vecX src) %{
2910   predicate(n->as_Vector()->length() == 16);
2911   match(Set dst (SubVB dst src));
2912   format %{ "psubb   $dst,$src\t! sub packed16B" %}
2913   ins_encode %{
2914     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
2915   %}
2916   ins_pipe( pipe_slow );
2917 %}
2918 
2919 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
2920   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2921   match(Set dst (SubVB src1 src2));
2922   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
2923   ins_encode %{
2924     bool vector256 = false;
2925     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2926   %}
2927   ins_pipe( pipe_slow );
2928 %}
2929 
2930 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
2931   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2932   match(Set dst (SubVB src (LoadVector mem)));
2933   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
2934   ins_encode %{
2935     bool vector256 = false;
2936     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2937   %}
2938   ins_pipe( pipe_slow );
2939 %}
2940 
2941 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
2942   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2943   match(Set dst (SubVB src1 src2));
2944   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
2945   ins_encode %{
2946     bool vector256 = true;
2947     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2948   %}
2949   ins_pipe( pipe_slow );
2950 %}
2951 
2952 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
2953   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2954   match(Set dst (SubVB src (LoadVector mem)));
2955   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
2956   ins_encode %{
2957     bool vector256 = true;
2958     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2959   %}
2960   ins_pipe( pipe_slow );
2961 %}
2962 
2963 // Shorts/Chars vector sub
2964 instruct vsub2S(vecS dst, vecS src) %{
2965   predicate(n->as_Vector()->length() == 2);
2966   match(Set dst (SubVS dst src));
2967   format %{ "psubw   $dst,$src\t! sub packed2S" %}
2968   ins_encode %{
2969     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
2970   %}
2971   ins_pipe( pipe_slow );
2972 %}
2973 
2974 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
2975   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2976   match(Set dst (SubVS src1 src2));
2977   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
2978   ins_encode %{
2979     bool vector256 = false;
2980     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2981   %}
2982   ins_pipe( pipe_slow );
2983 %}
2984 
2985 instruct vsub4S(vecD dst, vecD src) %{
2986   predicate(n->as_Vector()->length() == 4);
2987   match(Set dst (SubVS dst src));
2988   format %{ "psubw   $dst,$src\t! sub packed4S" %}
2989   ins_encode %{
2990     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
2991   %}
2992   ins_pipe( pipe_slow );
2993 %}
2994 
2995 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
2996   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2997   match(Set dst (SubVS src1 src2));
2998   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
2999   ins_encode %{
3000     bool vector256 = false;
3001     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3002   %}
3003   ins_pipe( pipe_slow );
3004 %}
3005 
3006 instruct vsub8S(vecX dst, vecX src) %{
3007   predicate(n->as_Vector()->length() == 8);
3008   match(Set dst (SubVS dst src));
3009   format %{ "psubw   $dst,$src\t! sub packed8S" %}
3010   ins_encode %{
3011     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3012   %}
3013   ins_pipe( pipe_slow );
3014 %}
3015 
3016 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
3017   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3018   match(Set dst (SubVS src1 src2));
3019   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
3020   ins_encode %{
3021     bool vector256 = false;
3022     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3023   %}
3024   ins_pipe( pipe_slow );
3025 %}
3026 
3027 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
3028   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3029   match(Set dst (SubVS src (LoadVector mem)));
3030   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
3031   ins_encode %{
3032     bool vector256 = false;
3033     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3034   %}
3035   ins_pipe( pipe_slow );
3036 %}
3037 
3038 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
3039   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3040   match(Set dst (SubVS src1 src2));
3041   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
3042   ins_encode %{
3043     bool vector256 = true;
3044     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3045   %}
3046   ins_pipe( pipe_slow );
3047 %}
3048 
3049 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
3050   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3051   match(Set dst (SubVS src (LoadVector mem)));
3052   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
3053   ins_encode %{
3054     bool vector256 = true;
3055     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3056   %}
3057   ins_pipe( pipe_slow );
3058 %}
3059 
3060 // Integers vector sub
3061 instruct vsub2I(vecD dst, vecD src) %{
3062   predicate(n->as_Vector()->length() == 2);
3063   match(Set dst (SubVI dst src));
3064   format %{ "psubd   $dst,$src\t! sub packed2I" %}
3065   ins_encode %{
3066     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3067   %}
3068   ins_pipe( pipe_slow );
3069 %}
3070 
3071 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
3072   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3073   match(Set dst (SubVI src1 src2));
3074   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
3075   ins_encode %{
3076     bool vector256 = false;
3077     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3078   %}
3079   ins_pipe( pipe_slow );
3080 %}
3081 
3082 instruct vsub4I(vecX dst, vecX src) %{
3083   predicate(n->as_Vector()->length() == 4);
3084   match(Set dst (SubVI dst src));
3085   format %{ "psubd   $dst,$src\t! sub packed4I" %}
3086   ins_encode %{
3087     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3088   %}
3089   ins_pipe( pipe_slow );
3090 %}
3091 
3092 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
3093   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3094   match(Set dst (SubVI src1 src2));
3095   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
3096   ins_encode %{
3097     bool vector256 = false;
3098     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3099   %}
3100   ins_pipe( pipe_slow );
3101 %}
3102 
3103 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
3104   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3105   match(Set dst (SubVI src (LoadVector mem)));
3106   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
3107   ins_encode %{
3108     bool vector256 = false;
3109     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3110   %}
3111   ins_pipe( pipe_slow );
3112 %}
3113 
3114 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
3115   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3116   match(Set dst (SubVI src1 src2));
3117   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
3118   ins_encode %{
3119     bool vector256 = true;
3120     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3121   %}
3122   ins_pipe( pipe_slow );
3123 %}
3124 
3125 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
3126   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3127   match(Set dst (SubVI src (LoadVector mem)));
3128   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
3129   ins_encode %{
3130     bool vector256 = true;
3131     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3132   %}
3133   ins_pipe( pipe_slow );
3134 %}
3135 
3136 // Longs vector sub
3137 instruct vsub2L(vecX dst, vecX src) %{
3138   predicate(n->as_Vector()->length() == 2);
3139   match(Set dst (SubVL dst src));
3140   format %{ "psubq   $dst,$src\t! sub packed2L" %}
3141   ins_encode %{
3142     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
3143   %}
3144   ins_pipe( pipe_slow );
3145 %}
3146 
3147 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
3148   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3149   match(Set dst (SubVL src1 src2));
3150   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
3151   ins_encode %{
3152     bool vector256 = false;
3153     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3154   %}
3155   ins_pipe( pipe_slow );
3156 %}
3157 
3158 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
3159   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3160   match(Set dst (SubVL src (LoadVector mem)));
3161   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
3162   ins_encode %{
3163     bool vector256 = false;
3164     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3165   %}
3166   ins_pipe( pipe_slow );
3167 %}
3168 
3169 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
3170   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3171   match(Set dst (SubVL src1 src2));
3172   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
3173   ins_encode %{
3174     bool vector256 = true;
3175     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3176   %}
3177   ins_pipe( pipe_slow );
3178 %}
3179 
3180 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
3181   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3182   match(Set dst (SubVL src (LoadVector mem)));
3183   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
3184   ins_encode %{
3185     bool vector256 = true;
3186     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3187   %}
3188   ins_pipe( pipe_slow );
3189 %}
3190 
3191 // Floats vector sub
3192 instruct vsub2F(vecD dst, vecD src) %{
3193   predicate(n->as_Vector()->length() == 2);
3194   match(Set dst (SubVF dst src));
3195   format %{ "subps   $dst,$src\t! sub packed2F" %}
3196   ins_encode %{
3197     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3198   %}
3199   ins_pipe( pipe_slow );
3200 %}
3201 
3202 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
3203   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3204   match(Set dst (SubVF src1 src2));
3205   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
3206   ins_encode %{
3207     bool vector256 = false;
3208     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3209   %}
3210   ins_pipe( pipe_slow );
3211 %}
3212 
3213 instruct vsub4F(vecX dst, vecX src) %{
3214   predicate(n->as_Vector()->length() == 4);
3215   match(Set dst (SubVF dst src));
3216   format %{ "subps   $dst,$src\t! sub packed4F" %}
3217   ins_encode %{
3218     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3219   %}
3220   ins_pipe( pipe_slow );
3221 %}
3222 
3223 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
3224   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3225   match(Set dst (SubVF src1 src2));
3226   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
3227   ins_encode %{
3228     bool vector256 = false;
3229     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3230   %}
3231   ins_pipe( pipe_slow );
3232 %}
3233 
3234 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
3235   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3236   match(Set dst (SubVF src (LoadVector mem)));
3237   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
3238   ins_encode %{
3239     bool vector256 = false;
3240     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3241   %}
3242   ins_pipe( pipe_slow );
3243 %}
3244 
3245 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
3246   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3247   match(Set dst (SubVF src1 src2));
3248   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
3249   ins_encode %{
3250     bool vector256 = true;
3251     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3252   %}
3253   ins_pipe( pipe_slow );
3254 %}
3255 
3256 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
3257   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3258   match(Set dst (SubVF src (LoadVector mem)));
3259   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
3260   ins_encode %{
3261     bool vector256 = true;
3262     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3263   %}
3264   ins_pipe( pipe_slow );
3265 %}
3266 
3267 // Doubles vector sub
3268 instruct vsub2D(vecX dst, vecX src) %{
3269   predicate(n->as_Vector()->length() == 2);
3270   match(Set dst (SubVD dst src));
3271   format %{ "subpd   $dst,$src\t! sub packed2D" %}
3272   ins_encode %{
3273     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
3274   %}
3275   ins_pipe( pipe_slow );
3276 %}
3277 
3278 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
3279   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3280   match(Set dst (SubVD src1 src2));
3281   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
3282   ins_encode %{
3283     bool vector256 = false;
3284     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3285   %}
3286   ins_pipe( pipe_slow );
3287 %}
3288 
3289 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
3290   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3291   match(Set dst (SubVD src (LoadVector mem)));
3292   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
3293   ins_encode %{
3294     bool vector256 = false;
3295     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3296   %}
3297   ins_pipe( pipe_slow );
3298 %}
3299 
3300 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
3301   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3302   match(Set dst (SubVD src1 src2));
3303   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
3304   ins_encode %{
3305     bool vector256 = true;
3306     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3307   %}
3308   ins_pipe( pipe_slow );
3309 %}
3310 
3311 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
3312   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3313   match(Set dst (SubVD src (LoadVector mem)));
3314   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
3315   ins_encode %{
3316     bool vector256 = true;
3317     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3318   %}
3319   ins_pipe( pipe_slow );
3320 %}
3321 
3322 // --------------------------------- MUL --------------------------------------
3323 
3324 // Shorts/Chars vector mul
3325 instruct vmul2S(vecS dst, vecS src) %{
3326   predicate(n->as_Vector()->length() == 2);
3327   match(Set dst (MulVS dst src));
3328   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
3329   ins_encode %{
3330     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3331   %}
3332   ins_pipe( pipe_slow );
3333 %}
3334 
3335 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
3336   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3337   match(Set dst (MulVS src1 src2));
3338   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
3339   ins_encode %{
3340     bool vector256 = false;
3341     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3342   %}
3343   ins_pipe( pipe_slow );
3344 %}
3345 
3346 instruct vmul4S(vecD dst, vecD src) %{
3347   predicate(n->as_Vector()->length() == 4);
3348   match(Set dst (MulVS dst src));
3349   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
3350   ins_encode %{
3351     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
3357   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3358   match(Set dst (MulVS src1 src2));
3359   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
3360   ins_encode %{
3361     bool vector256 = false;
3362     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3363   %}
3364   ins_pipe( pipe_slow );
3365 %}
3366 
3367 instruct vmul8S(vecX dst, vecX src) %{
3368   predicate(n->as_Vector()->length() == 8);
3369   match(Set dst (MulVS dst src));
3370   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
3371   ins_encode %{
3372     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3373   %}
3374   ins_pipe( pipe_slow );
3375 %}
3376 
3377 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
3378   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3379   match(Set dst (MulVS src1 src2));
3380   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
3381   ins_encode %{
3382     bool vector256 = false;
3383     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3384   %}
3385   ins_pipe( pipe_slow );
3386 %}
3387 
3388 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
3389   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3390   match(Set dst (MulVS src (LoadVector mem)));
3391   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
3392   ins_encode %{
3393     bool vector256 = false;
3394     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3395   %}
3396   ins_pipe( pipe_slow );
3397 %}
3398 
3399 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
3400   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3401   match(Set dst (MulVS src1 src2));
3402   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
3403   ins_encode %{
3404     bool vector256 = true;
3405     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3406   %}
3407   ins_pipe( pipe_slow );
3408 %}
3409 
3410 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
3411   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3412   match(Set dst (MulVS src (LoadVector mem)));
3413   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
3414   ins_encode %{
3415     bool vector256 = true;
3416     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3417   %}
3418   ins_pipe( pipe_slow );
3419 %}
3420 
3421 // Integers vector mul (sse4_1)
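// Note: pmulld (packed 32-bit low multiply) is an SSE4.1 instruction, hence the
// UseSSE > 3 predicates on the non-AVX rules below; without SSE4.1 the int
// vector multiply is not matched and such multiplies stay scalar.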
3422 instruct vmul2I(vecD dst, vecD src) %{
3423   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
3424   match(Set dst (MulVI dst src));
3425   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
3426   ins_encode %{
3427     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
3428   %}
3429   ins_pipe( pipe_slow );
3430 %}
3431 
3432 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
3433   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3434   match(Set dst (MulVI src1 src2));
3435   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
3436   ins_encode %{
3437     bool vector256 = false;
3438     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3439   %}
3440   ins_pipe( pipe_slow );
3441 %}
3442 
3443 instruct vmul4I(vecX dst, vecX src) %{
3444   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
3445   match(Set dst (MulVI dst src));
3446   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
3447   ins_encode %{
3448     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
3449   %}
3450   ins_pipe( pipe_slow );
3451 %}
3452 
3453 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
3454   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3455   match(Set dst (MulVI src1 src2));
3456   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
3457   ins_encode %{
3458     bool vector256 = false;
3459     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3460   %}
3461   ins_pipe( pipe_slow );
3462 %}
3463 
3464 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
3465   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3466   match(Set dst (MulVI src (LoadVector mem)));
3467   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
3468   ins_encode %{
3469     bool vector256 = false;
3470     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3471   %}
3472   ins_pipe( pipe_slow );
3473 %}
3474 
3475 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
3476   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3477   match(Set dst (MulVI src1 src2));
3478   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
3479   ins_encode %{
3480     bool vector256 = true;
3481     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3482   %}
3483   ins_pipe( pipe_slow );
3484 %}
3485 
3486 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
3487   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3488   match(Set dst (MulVI src (LoadVector mem)));
3489   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
3490   ins_encode %{
3491     bool vector256 = true;
3492     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3493   %}
3494   ins_pipe( pipe_slow );
3495 %}
3496 
3497 // Floats vector mul
3498 instruct vmul2F(vecD dst, vecD src) %{
3499   predicate(n->as_Vector()->length() == 2);
3500   match(Set dst (MulVF dst src));
3501   format %{ "mulps   $dst,$src\t! mul packed2F" %}
3502   ins_encode %{
3503     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
3504   %}
3505   ins_pipe( pipe_slow );
3506 %}
3507 
3508 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
3509   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3510   match(Set dst (MulVF src1 src2));
3511   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
3512   ins_encode %{
3513     bool vector256 = false;
3514     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3515   %}
3516   ins_pipe( pipe_slow );
3517 %}
3518 
3519 instruct vmul4F(vecX dst, vecX src) %{
3520   predicate(n->as_Vector()->length() == 4);
3521   match(Set dst (MulVF dst src));
3522   format %{ "mulps   $dst,$src\t! mul packed4F" %}
3523   ins_encode %{
3524     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
3525   %}
3526   ins_pipe( pipe_slow );
3527 %}
3528 
3529 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
3530   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3531   match(Set dst (MulVF src1 src2));
3532   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
3533   ins_encode %{
3534     bool vector256 = false;
3535     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3536   %}
3537   ins_pipe( pipe_slow );
3538 %}
3539 
3540 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
3541   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3542   match(Set dst (MulVF src (LoadVector mem)));
3543   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
3544   ins_encode %{
3545     bool vector256 = false;
3546     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3547   %}
3548   ins_pipe( pipe_slow );
3549 %}
3550 
3551 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
3552   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3553   match(Set dst (MulVF src1 src2));
3554   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
3555   ins_encode %{
3556     bool vector256 = true;
3557     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3558   %}
3559   ins_pipe( pipe_slow );
3560 %}
3561 
3562 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
3563   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3564   match(Set dst (MulVF src (LoadVector mem)));
3565   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
3566   ins_encode %{
3567     bool vector256 = true;
3568     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3569   %}
3570   ins_pipe( pipe_slow );
3571 %}
3572 
3573 // Doubles vector mul
3574 instruct vmul2D(vecX dst, vecX src) %{
3575   predicate(n->as_Vector()->length() == 2);
3576   match(Set dst (MulVD dst src));
3577   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
3578   ins_encode %{
3579     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
3580   %}
3581   ins_pipe( pipe_slow );
3582 %}
3583 
3584 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
3585   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3586   match(Set dst (MulVD src1 src2));
3587   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
3588   ins_encode %{
3589     bool vector256 = false;
3590     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3591   %}
3592   ins_pipe( pipe_slow );
3593 %}
3594 
3595 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
3596   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3597   match(Set dst (MulVD src (LoadVector mem)));
3598   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
3599   ins_encode %{
3600     bool vector256 = false;
3601     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3602   %}
3603   ins_pipe( pipe_slow );
3604 %}
3605 
3606 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
3607   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3608   match(Set dst (MulVD src1 src2));
3609   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
3610   ins_encode %{
3611     bool vector256 = true;
3612     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3613   %}
3614   ins_pipe( pipe_slow );
3615 %}
3616 
3617 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
3618   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3619   match(Set dst (MulVD src (LoadVector mem)));
3620   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
3621   ins_encode %{
3622     bool vector256 = true;
3623     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3624   %}
3625   ins_pipe( pipe_slow );
3626 %}
3627 
3628 // --------------------------------- DIV --------------------------------------
3629 
3630 // Floats vector div
3631 instruct vdiv2F(vecD dst, vecD src) %{
3632   predicate(n->as_Vector()->length() == 2);
3633   match(Set dst (DivVF dst src));
3634   format %{ "divps   $dst,$src\t! div packed2F" %}
3635   ins_encode %{
3636     __ divps($dst$$XMMRegister, $src$$XMMRegister);
3637   %}
3638   ins_pipe( pipe_slow );
3639 %}
3640 
3641 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
3642   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3643   match(Set dst (DivVF src1 src2));
3644   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
3645   ins_encode %{
3646     bool vector256 = false;
3647     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3648   %}
3649   ins_pipe( pipe_slow );
3650 %}
3651 
3652 instruct vdiv4F(vecX dst, vecX src) %{
3653   predicate(n->as_Vector()->length() == 4);
3654   match(Set dst (DivVF dst src));
3655   format %{ "divps   $dst,$src\t! div packed4F" %}
3656   ins_encode %{
3657     __ divps($dst$$XMMRegister, $src$$XMMRegister);
3658   %}
3659   ins_pipe( pipe_slow );
3660 %}
3661 
3662 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
3663   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3664   match(Set dst (DivVF src1 src2));
3665   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
3666   ins_encode %{
3667     bool vector256 = false;
3668     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3669   %}
3670   ins_pipe( pipe_slow );
3671 %}
3672 
3673 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
3674   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3675   match(Set dst (DivVF src (LoadVector mem)));
3676   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
3677   ins_encode %{
3678     bool vector256 = false;
3679     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3680   %}
3681   ins_pipe( pipe_slow );
3682 %}
3683 
3684 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
3685   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3686   match(Set dst (DivVF src1 src2));
3687   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
3688   ins_encode %{
3689     bool vector256 = true;
3690     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3691   %}
3692   ins_pipe( pipe_slow );
3693 %}
3694 
3695 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
3696   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3697   match(Set dst (DivVF src (LoadVector mem)));
3698   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
3699   ins_encode %{
3700     bool vector256 = true;
3701     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3702   %}
3703   ins_pipe( pipe_slow );
3704 %}
3705 
3706 // Doubles vector div
3707 instruct vdiv2D(vecX dst, vecX src) %{
3708   predicate(n->as_Vector()->length() == 2);
3709   match(Set dst (DivVD dst src));
3710   format %{ "divpd   $dst,$src\t! div packed2D" %}
3711   ins_encode %{
3712     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
3713   %}
3714   ins_pipe( pipe_slow );
3715 %}
3716 
3717 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
3718   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3719   match(Set dst (DivVD src1 src2));
3720   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
3721   ins_encode %{
3722     bool vector256 = false;
3723     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3724   %}
3725   ins_pipe( pipe_slow );
3726 %}
3727 
3728 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
3729   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3730   match(Set dst (DivVD src (LoadVector mem)));
3731   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
3732   ins_encode %{
3733     bool vector256 = false;
3734     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3735   %}
3736   ins_pipe( pipe_slow );
3737 %}
3738 
3739 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
3740   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3741   match(Set dst (DivVD src1 src2));
3742   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
3743   ins_encode %{
3744     bool vector256 = true;
3745     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3746   %}
3747   ins_pipe( pipe_slow );
3748 %}
3749 
3750 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
3751   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3752   match(Set dst (DivVD src (LoadVector mem)));
3753   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
3754   ins_encode %{
3755     bool vector256 = true;
3756     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3757   %}
3758   ins_pipe( pipe_slow );
3759 %}
3760 
3761 // ------------------------------ LeftShift -----------------------------------
3762 
3763 // Shorts/Chars vector left shift
3764 instruct vsll2S(vecS dst, regF shift) %{
3765   predicate(n->as_Vector()->length() == 2);
3766   match(Set dst (LShiftVS dst shift));
3767   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
3768   ins_encode %{
3769     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3770   %}
3771   ins_pipe( pipe_slow );
3772 %}
3773 
3774 instruct vsll2S_imm(vecS dst, immI8 shift) %{
3775   predicate(n->as_Vector()->length() == 2);
3776   match(Set dst (LShiftVS dst shift));
3777   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
3778   ins_encode %{
3779     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3780   %}
3781   ins_pipe( pipe_slow );
3782 %}
3783 
3784 instruct vsll2S_reg(vecS dst, vecS src, regF shift) %{
3785   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3786   match(Set dst (LShiftVS src shift));
3787   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
3788   ins_encode %{
3789     bool vector256 = false;
3790     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3791   %}
3792   ins_pipe( pipe_slow );
3793 %}
3794 
3795 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
3796   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3797   match(Set dst (LShiftVS src shift));
3798   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
3799   ins_encode %{
3800     bool vector256 = false;
3801     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3802   %}
3803   ins_pipe( pipe_slow );
3804 %}
3805 
3806 instruct vsll4S(vecD dst, regF shift) %{
3807   predicate(n->as_Vector()->length() == 4);
3808   match(Set dst (LShiftVS dst shift));
3809   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
3810   ins_encode %{
3811     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3812   %}
3813   ins_pipe( pipe_slow );
3814 %}
3815 
3816 instruct vsll4S_imm(vecD dst, immI8 shift) %{
3817   predicate(n->as_Vector()->length() == 4);
3818   match(Set dst (LShiftVS dst shift));
3819   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
3820   ins_encode %{
3821     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3822   %}
3823   ins_pipe( pipe_slow );
3824 %}
3825 
3826 instruct vsll4S_reg(vecD dst, vecD src, regF shift) %{
3827   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3828   match(Set dst (LShiftVS src shift));
3829   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
3830   ins_encode %{
3831     bool vector256 = false;
3832     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3833   %}
3834   ins_pipe( pipe_slow );
3835 %}
3836 
3837 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
3838   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3839   match(Set dst (LShiftVS src shift));
3840   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
3841   ins_encode %{
3842     bool vector256 = false;
3843     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3844   %}
3845   ins_pipe( pipe_slow );
3846 %}
3847 
3848 instruct vsll8S(vecX dst, regF shift) %{
3849   predicate(n->as_Vector()->length() == 8);
3850   match(Set dst (LShiftVS dst shift));
3851   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
3852   ins_encode %{
3853     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3854   %}
3855   ins_pipe( pipe_slow );
3856 %}
3857 
3858 instruct vsll8S_imm(vecX dst, immI8 shift) %{
3859   predicate(n->as_Vector()->length() == 8);
3860   match(Set dst (LShiftVS dst shift));
3861   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
3862   ins_encode %{
3863     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3864   %}
3865   ins_pipe( pipe_slow );
3866 %}
3867 
3868 instruct vsll8S_reg(vecX dst, vecX src, regF shift) %{
3869   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3870   match(Set dst (LShiftVS src shift));
3871   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
3872   ins_encode %{
3873     bool vector256 = false;
3874     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3875   %}
3876   ins_pipe( pipe_slow );
3877 %}
3878 
3879 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
3880   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3881   match(Set dst (LShiftVS src shift));
3882   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
3883   ins_encode %{
3884     bool vector256 = false;
3885     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3886   %}
3887   ins_pipe( pipe_slow );
3888 %}
3889 
3890 instruct vsll16S_reg(vecY dst, vecY src, regF shift) %{
3891   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3892   match(Set dst (LShiftVS src shift));
3893   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
3894   ins_encode %{
3895     bool vector256 = true;
3896     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3897   %}
3898   ins_pipe( pipe_slow );
3899 %}
3900 
3901 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
3902   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3903   match(Set dst (LShiftVS src shift));
3904   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
3905   ins_encode %{
3906     bool vector256 = true;
3907     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3908   %}
3909   ins_pipe( pipe_slow );
3910 %}
3911 
3912 // Integers vector left shift
3913 instruct vsll2I(vecD dst, regF shift) %{
3914   predicate(n->as_Vector()->length() == 2);
3915   match(Set dst (LShiftVI dst shift));
3916   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
3917   ins_encode %{
3918     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
3919   %}
3920   ins_pipe( pipe_slow );
3921 %}
3922 
3923 instruct vsll2I_imm(vecD dst, immI8 shift) %{
3924   predicate(n->as_Vector()->length() == 2);
3925   match(Set dst (LShiftVI dst shift));
3926   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
3927   ins_encode %{
3928     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
3929   %}
3930   ins_pipe( pipe_slow );
3931 %}
3932 
3933 instruct vsll2I_reg(vecD dst, vecD src, regF shift) %{
3934   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3935   match(Set dst (LShiftVI src shift));
3936   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
3937   ins_encode %{
3938     bool vector256 = false;
3939     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3940   %}
3941   ins_pipe( pipe_slow );
3942 %}
3943 
3944 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
3945   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3946   match(Set dst (LShiftVI src shift));
3947   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
3948   ins_encode %{
3949     bool vector256 = false;
3950     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3951   %}
3952   ins_pipe( pipe_slow );
3953 %}
3954 
3955 instruct vsll4I(vecX dst, regF shift) %{
3956   predicate(n->as_Vector()->length() == 4);
3957   match(Set dst (LShiftVI dst shift));
3958   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
3959   ins_encode %{
3960     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
3961   %}
3962   ins_pipe( pipe_slow );
3963 %}
3964 
3965 instruct vsll4I_imm(vecX dst, immI8 shift) %{
3966   predicate(n->as_Vector()->length() == 4);
3967   match(Set dst (LShiftVI dst shift));
3968   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
3969   ins_encode %{
3970     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
3971   %}
3972   ins_pipe( pipe_slow );
3973 %}
3974 
3975 instruct vsll4I_reg(vecX dst, vecX src, regF shift) %{
3976   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3977   match(Set dst (LShiftVI src shift));
3978   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
3979   ins_encode %{
3980     bool vector256 = false;
3981     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3982   %}
3983   ins_pipe( pipe_slow );
3984 %}
3985 
3986 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
3987   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3988   match(Set dst (LShiftVI src shift));
3989   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
3990   ins_encode %{
3991     bool vector256 = false;
3992     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3993   %}
3994   ins_pipe( pipe_slow );
3995 %}
3996 
3997 instruct vsll8I_reg(vecY dst, vecY src, regF shift) %{
3998   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3999   match(Set dst (LShiftVI src shift));
4000   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4001   ins_encode %{
4002     bool vector256 = true;
4003     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4004   %}
4005   ins_pipe( pipe_slow );
4006 %}
4007 
4008 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4009   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4010   match(Set dst (LShiftVI src shift));
4011   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4012   ins_encode %{
4013     bool vector256 = true;
4014     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4015   %}
4016   ins_pipe( pipe_slow );
4017 %}
4018 
4019 // Longs vector left shift
4020 instruct vsll2L(vecX dst, regF shift) %{
4021   predicate(n->as_Vector()->length() == 2);
4022   match(Set dst (LShiftVL dst shift));
4023   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4024   ins_encode %{
4025     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
4026   %}
4027   ins_pipe( pipe_slow );
4028 %}
4029 
4030 instruct vsll2L_imm(vecX dst, immI8 shift) %{
4031   predicate(n->as_Vector()->length() == 2);
4032   match(Set dst (LShiftVL dst shift));
4033   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4034   ins_encode %{
4035     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
4036   %}
4037   ins_pipe( pipe_slow );
4038 %}
4039 
4040 instruct vsll2L_reg(vecX dst, vecX src, regF shift) %{
4041   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4042   match(Set dst (LShiftVL src shift));
4043   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4044   ins_encode %{
4045     bool vector256 = false;
4046     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4047   %}
4048   ins_pipe( pipe_slow );
4049 %}
4050 
4051 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4052   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4053   match(Set dst (LShiftVL src shift));
4054   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4055   ins_encode %{
4056     bool vector256 = false;
4057     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4058   %}
4059   ins_pipe( pipe_slow );
4060 %}
4061 
4062 instruct vsll4L_reg(vecY dst, vecY src, regF shift) %{
4063   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4064   match(Set dst (LShiftVL src shift));
4065   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4066   ins_encode %{
4067     bool vector256 = true;
4068     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4069   %}
4070   ins_pipe( pipe_slow );
4071 %}
4072 
4073 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4074   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4075   match(Set dst (LShiftVL src shift));
4076   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4077   ins_encode %{
4078     bool vector256 = true;
4079     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4080   %}
4081   ins_pipe( pipe_slow );
4082 %}
4083 
4084 // ----------------------- LogicalRightShift -----------------------------------
4085 
4086 // Shorts/Chars vector logical right shift produces an incorrect Java result
4087 // for negative data because Java code converts a short value into an int with
4088 // sign extension before the shift.
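// For example (values chosen only to illustrate): with short s = (short)0x8000,
// Java evaluates (s >>> 1) as 0xFFFF8000 >>> 1 == 0x7FFFC000, whose low 16 bits
// are 0xC000, while a lanewise psrlw would yield 0x4000 in that lane. Since the
// results differ, no rules for a packed short logical right shift are provided.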
4089 
4090 // Integers vector logical right shift
4091 instruct vsrl2I(vecD dst, regF shift) %{
4092   predicate(n->as_Vector()->length() == 2);
4093   match(Set dst (URShiftVI dst shift));
4094   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4095   ins_encode %{
4096     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
4097   %}
4098   ins_pipe( pipe_slow );
4099 %}
4100 
4101 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
4102   predicate(n->as_Vector()->length() == 2);
4103   match(Set dst (URShiftVI dst shift));
4104   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4105   ins_encode %{
4106     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
4107   %}
4108   ins_pipe( pipe_slow );
4109 %}
4110 
4111 instruct vsrl2I_reg(vecD dst, vecD src, regF shift) %{
4112   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4113   match(Set dst (URShiftVI src shift));
4114   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4115   ins_encode %{
4116     bool vector256 = false;
4117     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4118   %}
4119   ins_pipe( pipe_slow );
4120 %}
4121 
4122 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4123   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4124   match(Set dst (URShiftVI src shift));
4125   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4126   ins_encode %{
4127     bool vector256 = false;
4128     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4129   %}
4130   ins_pipe( pipe_slow );
4131 %}
4132 
4133 instruct vsrl4I(vecX dst, regF shift) %{
4134   predicate(n->as_Vector()->length() == 4);
4135   match(Set dst (URShiftVI dst shift));
4136   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
4137   ins_encode %{
4138     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
4139   %}
4140   ins_pipe( pipe_slow );
4141 %}
4142 
4143 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
4144   predicate(n->as_Vector()->length() == 4);
4145   match(Set dst (URShiftVI dst shift));
4146   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
4147   ins_encode %{
4148     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
4149   %}
4150   ins_pipe( pipe_slow );
4151 %}
4152 
4153 instruct vsrl4I_reg(vecX dst, vecX src, regF shift) %{
4154   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4155   match(Set dst (URShiftVI src shift));
4156   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
4157   ins_encode %{
4158     bool vector256 = false;
4159     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4160   %}
4161   ins_pipe( pipe_slow );
4162 %}
4163 
4164 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4165   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4166   match(Set dst (URShiftVI src shift));
4167   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
4168   ins_encode %{
4169     bool vector256 = false;
4170     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4171   %}
4172   ins_pipe( pipe_slow );
4173 %}
4174 
4175 instruct vsrl8I_reg(vecY dst, vecY src, regF shift) %{
4176   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4177   match(Set dst (URShiftVI src shift));
4178   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
4179   ins_encode %{
4180     bool vector256 = true;
4181     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4182   %}
4183   ins_pipe( pipe_slow );
4184 %}
4185 
4186 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4187   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4188   match(Set dst (URShiftVI src shift));
4189   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
4190   ins_encode %{
4191     bool vector256 = true;
4192     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4193   %}
4194   ins_pipe( pipe_slow );
4195 %}
4196 
4197 // Longs vector logical right shift
4198 instruct vsrl2L(vecX dst, regF shift) %{
4199   predicate(n->as_Vector()->length() == 2);
4200   match(Set dst (URShiftVL dst shift));
4201   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
4202   ins_encode %{
4203     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
4204   %}
4205   ins_pipe( pipe_slow );
4206 %}
4207 
4208 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
4209   predicate(n->as_Vector()->length() == 2);
4210   match(Set dst (URShiftVL dst shift));
4211   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
4212   ins_encode %{
4213     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
4214   %}
4215   ins_pipe( pipe_slow );
4216 %}
4217 
4218 instruct vsrl2L_reg(vecX dst, vecX src, regF shift) %{
4219   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4220   match(Set dst (URShiftVL src shift));
4221   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
4222   ins_encode %{
4223     bool vector256 = false;
4224     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4225   %}
4226   ins_pipe( pipe_slow );
4227 %}
4228 
4229 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4230   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4231   match(Set dst (URShiftVL src shift));
4232   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
4233   ins_encode %{
4234     bool vector256 = false;
4235     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4236   %}
4237   ins_pipe( pipe_slow );
4238 %}
4239 
4240 instruct vsrl4L_reg(vecY dst, vecY src, regF shift) %{
4241   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4242   match(Set dst (URShiftVL src shift));
4243   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
4244   ins_encode %{
4245     bool vector256 = true;
4246     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4247   %}
4248   ins_pipe( pipe_slow );
4249 %}
4250 
4251 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4252   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4253   match(Set dst (URShiftVL src shift));
4254   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
4255   ins_encode %{
4256     bool vector256 = true;
4257     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4258   %}
4259   ins_pipe( pipe_slow );
4260 %}
4261 
4262 // ------------------- ArithmeticRightShift -----------------------------------
4263 
4264 // Shorts/Chars vector arithmetic right shift
4265 instruct vsra2S(vecS dst, regF shift) %{
4266   predicate(n->as_Vector()->length() == 2);
4267   match(Set dst (RShiftVS dst shift));
4268   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
4269   ins_encode %{
4270     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
4271   %}
4272   ins_pipe( pipe_slow );
4273 %}
4274 
4275 instruct vsra2S_imm(vecS dst, immI8 shift) %{
4276   predicate(n->as_Vector()->length() == 2);
4277   match(Set dst (RShiftVS dst shift));
4278   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
4279   ins_encode %{
4280     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
4281   %}
4282   ins_pipe( pipe_slow );
4283 %}
4284 
4285 instruct vsra2S_reg(vecS dst, vecS src, regF shift) %{
4286   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4287   match(Set dst (RShiftVS src shift));
4288   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
4289   ins_encode %{
4290     bool vector256 = false;
4291     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4292   %}
4293   ins_pipe( pipe_slow );
4294 %}
4295 
4296 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4297   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4298   match(Set dst (RShiftVS src shift));
4299   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
4300   ins_encode %{
4301     bool vector256 = false;
4302     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4303   %}
4304   ins_pipe( pipe_slow );
4305 %}
4306 
4307 instruct vsra4S(vecD dst, regF shift) %{
4308   predicate(n->as_Vector()->length() == 4);
4309   match(Set dst (RShiftVS dst shift));
4310   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
4311   ins_encode %{
4312     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
4313   %}
4314   ins_pipe( pipe_slow );
4315 %}
4316 
4317 instruct vsra4S_imm(vecD dst, immI8 shift) %{
4318   predicate(n->as_Vector()->length() == 4);
4319   match(Set dst (RShiftVS dst shift));
4320   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
4321   ins_encode %{
4322     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
4323   %}
4324   ins_pipe( pipe_slow );
4325 %}
4326 
4327 instruct vsra4S_reg(vecD dst, vecD src, regF shift) %{
4328   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4329   match(Set dst (RShiftVS src shift));
4330   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
4331   ins_encode %{
4332     bool vector256 = false;
4333     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4334   %}
4335   ins_pipe( pipe_slow );
4336 %}
4337 
4338 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4339   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4340   match(Set dst (RShiftVS src shift));
4341   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
4342   ins_encode %{
4343     bool vector256 = false;
4344     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4345   %}
4346   ins_pipe( pipe_slow );
4347 %}
4348 
4349 instruct vsra8S(vecX dst, regF shift) %{
4350   predicate(n->as_Vector()->length() == 8);
4351   match(Set dst (RShiftVS dst shift));
4352   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
4353   ins_encode %{
4354     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
4355   %}
4356   ins_pipe( pipe_slow );
4357 %}
4358 
4359 instruct vsra8S_imm(vecX dst, immI8 shift) %{
4360   predicate(n->as_Vector()->length() == 8);
4361   match(Set dst (RShiftVS dst shift));
4362   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
4363   ins_encode %{
4364     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
4365   %}
4366   ins_pipe( pipe_slow );
4367 %}
4368 
4369 instruct vsra8S_reg(vecX dst, vecX src, regF shift) %{
4370   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4371   match(Set dst (RShiftVS src shift));
4372   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
4373   ins_encode %{
4374     bool vector256 = false;
4375     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4376   %}
4377   ins_pipe( pipe_slow );
4378 %}
4379 
4380 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4381   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4382   match(Set dst (RShiftVS src shift));
4383   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
4384   ins_encode %{
4385     bool vector256 = false;
4386     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4387   %}
4388   ins_pipe( pipe_slow );
4389 %}
4390 
4391 instruct vsra16S_reg(vecY dst, vecY src, regF shift) %{
4392   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4393   match(Set dst (RShiftVS src shift));
4394   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
4395   ins_encode %{
4396     bool vector256 = true;
4397     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4398   %}
4399   ins_pipe( pipe_slow );
4400 %}
4401 
4402 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4403   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4404   match(Set dst (RShiftVS src shift));
4405   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
4406   ins_encode %{
4407     bool vector256 = true;
4408     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4409   %}
4410   ins_pipe( pipe_slow );
4411 %}
4412 
4413 // Integers vector arithmetic right shift
4414 instruct vsra2I(vecD dst, regF shift) %{
4415   predicate(n->as_Vector()->length() == 2);
4416   match(Set dst (RShiftVI dst shift));
4417   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
4418   ins_encode %{
4419     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
4420   %}
4421   ins_pipe( pipe_slow );
4422 %}
4423 
4424 instruct vsra2I_imm(vecD dst, immI8 shift) %{
4425   predicate(n->as_Vector()->length() == 2);
4426   match(Set dst (RShiftVI dst shift));
4427   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
4428   ins_encode %{
4429     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
4430   %}
4431   ins_pipe( pipe_slow );
4432 %}
4433 
4434 instruct vsra2I_reg(vecD dst, vecD src, regF shift) %{
4435   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4436   match(Set dst (RShiftVI src shift));
4437   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
4438   ins_encode %{
4439     bool vector256 = false;
4440     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4441   %}
4442   ins_pipe( pipe_slow );
4443 %}
4444 
4445 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4446   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4447   match(Set dst (RShiftVI src shift));
4448   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
4449   ins_encode %{
4450     bool vector256 = false;
4451     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4452   %}
4453   ins_pipe( pipe_slow );
4454 %}
4455 
instruct vsra4I(vecX dst, regF shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_imm(vecX dst, immI8 shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI dst shift));
  format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    __ psrad($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg(vecX dst, vecX src, regF shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
  ins_encode %{
    bool vector256 = false;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg(vecY dst, vecY src, regF shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (RShiftVI src shift));
  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
  ins_encode %{
    bool vector256 = true;
    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// There are no vector arithmetic right shift instructions for longs.
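// SSE/AVX/AVX2 provide no packed 64-bit arithmetic right shift (no psraq), so
// RShiftVL is not matched here and long right shifts are left scalar. For
// reference only, one possible SSE2 emulation, expressed with compiler
// intrinsics rather than this file's encodings (sra_epi64 is a hypothetical
// helper name, not used anywhere in this port):
//
//   __m128i sra_epi64(__m128i x, int s) {   // 0 <= s <= 63
//     __m128i cnt  = _mm_cvtsi32_si128(s);
//     __m128i fill = _mm_cvtsi32_si128(64 - s);
//     __m128i sign = _mm_srai_epi32(_mm_shuffle_epi32(x, 0xF5), 31); // per-lane sign mask
//     return _mm_or_si128(_mm_srl_epi64(x, cnt),                     // logical shift right
//                         _mm_sll_epi64(sign, fill));                // OR in the sign bits
//   }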


// --------------------------------- AND --------------------------------------
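// The vector logical ops below come in one flavor per vector width: vecS, vecD,
// vecX and vecY hold 4, 8, 16 and 32 bytes respectively. The two-operand SSE
// forms overwrite $dst in place; the AVX forms (UseAVX > 0) are three-operand,
// and the 32-byte forms additionally require AVX2 (UseAVX > 1) for 256-bit
// integer operations. Memory-operand variants are provided only for the AVX
// encodings. The same layout applies to the OR and XOR sections that follow.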

instruct vand4B(vecS dst, vecS src) %{
  predicate(n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (AndV dst src));
  format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
  ins_encode %{
    __ pand($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (AndV src (LoadVector mem)));
  format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (AndV src1 src2));
  format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (AndV src (LoadVector mem)));
  format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- OR ---------------------------------------

instruct vor4B(vecS dst, vecS src) %{
  predicate(n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV dst src));
  format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
  ins_encode %{
    __ por($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (OrV src1 src2));
  format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (OrV src (LoadVector mem)));
  format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- XOR --------------------------------------

instruct vxor4B(vecS dst, vecS src) %{
  predicate(n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B(vecD dst, vecD src) %{
  predicate(n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B(vecX dst, vecX src) %{
  predicate(n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV dst src));
  format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
  ins_encode %{
    __ pxor($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
  ins_encode %{
    bool vector256 = false;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src1 src2));
  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
  %}
  ins_pipe( pipe_slow );
%}

instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
  predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
  match(Set dst (XorV src (LoadVector mem)));
  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
  ins_encode %{
    bool vector256 = true;
    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
  %}
  ins_pipe( pipe_slow );
%}
