1 //
   2 // Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
// architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
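//
// For example, in the definitions below
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as save-on-call for both the VM and the C calling convention,
// with ideal type Op_RegF and hardware encoding 0 -- the value that ends up
// in the register fields of emitted instructions.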
  61 
// XMM registers.  Each register is 256 bits, or 8 words, labeled (a)-h.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
  68 // Linux ABI:   No register preserved across function calls
  69 //              XMM0-XMM7 might hold parameters
  70 // Windows ABI: XMM6-XMM15 preserved across function calls
  71 //              XMM0-XMM3 might hold parameters
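//
// This is why, in the definitions below, XMM6-XMM15 use SOE for the
// C-convention save type under #ifdef _WIN64, while on the other platforms
// all XMM registers are SOC/SOC.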
  72 
  73 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  74 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  75 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  76 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  77 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  78 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  79 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  80 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  81 
  82 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  83 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  84 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  85 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  86 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  87 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  88 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  89 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  90 
  91 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  92 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  93 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  94 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  95 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  96 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  97 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  98 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  99 
 100 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 101 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 102 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 103 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 104 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 105 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 106 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 107 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 108 
 109 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 110 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 111 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 112 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 113 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 114 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 115 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 116 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 117 
 118 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 119 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 120 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 121 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 122 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 123 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 124 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 125 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 126 
 127 #ifdef _WIN64
 128 
 129 reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
 130 reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 131 reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 132 reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 133 reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 134 reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 135 reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 136 reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 137 
 138 reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
 139 reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 140 reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 141 reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 142 reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 143 reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 144 reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 145 reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 146 
 147 reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
 148 reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 149 reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 150 reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 151 reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 152 reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 153 reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 154 reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 155 
 156 reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
 157 reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 158 reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 159 reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 160 reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 161 reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 162 reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 163 reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 164 
 165 reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
 166 reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 167 reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 168 reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 169 reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 170 reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 171 reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 172 reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 173 
 174 reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
 175 reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 176 reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 177 reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 178 reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 179 reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 180 reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 181 reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 182 
 183 reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
 184 reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 185 reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 186 reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 187 reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 188 reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 189 reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 190 reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 191 
 192 reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
 193 reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 194 reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 195 reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 196 reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 197 reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 198 reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 199 reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 200 
 201 reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
 202 reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 203 reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 204 reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 205 reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 206 reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 207 reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 208 reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 209 
 210 reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
 211 reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 212 reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 213 reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 214 reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 215 reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 216 reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 217 reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 218 
 219 #else // _WIN64
 220 
 221 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 222 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 223 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 224 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 225 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 226 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 227 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 228 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 229 
 230 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 231 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 232 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 233 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 234 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 235 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 236 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 237 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 238 
 239 #ifdef _LP64
 240 
 241 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 242 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 243 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 244 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 245 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 246 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 247 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 248 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 249 
 250 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 251 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 252 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 253 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 254 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 255 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 256 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 257 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 258 
 259 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 260 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 261 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 262 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 263 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 264 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 265 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 266 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 267 
 268 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 269 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 270 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 271 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 272 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 273 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 274 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 275 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 276 
 277 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 278 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 279 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 280 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 281 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 282 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 283 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 284 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 285 
 286 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 287 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 288 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 289 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 290 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 291 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 292 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 293 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 294 
 295 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 296 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 297 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 298 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 299 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 300 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 301 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 302 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 303 
 304 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 305 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 306 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 307 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 308 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 309 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 310 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 311 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 312 
 313 #endif // _LP64
 314 
 315 #endif // _WIN64
 316 
 317 #ifdef _LP64
 318 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 319 #else
 320 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 321 #endif // _LP64
 322 
 323 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 324                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 325                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 326                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 327                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 328                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 329                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 330                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 331 #ifdef _LP64
 332                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 333                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 334                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 335                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 336                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 337                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 338                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 339                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 340 #endif
 341                    );
 342 
 343 // flags allocation class should be last.
 344 alloc_class chunk2(RFLAGS);
 345 
 346 // Singleton class for condition codes
 347 reg_class int_flags(RFLAGS);
 348 
 349 // Class for all float registers
 350 reg_class float_reg(XMM0,
 351                     XMM1,
 352                     XMM2,
 353                     XMM3,
 354                     XMM4,
 355                     XMM5,
 356                     XMM6,
 357                     XMM7
 358 #ifdef _LP64
 359                    ,XMM8,
 360                     XMM9,
 361                     XMM10,
 362                     XMM11,
 363                     XMM12,
 364                     XMM13,
 365                     XMM14,
 366                     XMM15
 367 #endif
 368                     );
 369 
 370 // Class for all double registers
 371 reg_class double_reg(XMM0,  XMM0b,
 372                      XMM1,  XMM1b,
 373                      XMM2,  XMM2b,
 374                      XMM3,  XMM3b,
 375                      XMM4,  XMM4b,
 376                      XMM5,  XMM5b,
 377                      XMM6,  XMM6b,
 378                      XMM7,  XMM7b
 379 #ifdef _LP64
 380                     ,XMM8,  XMM8b,
 381                      XMM9,  XMM9b,
 382                      XMM10, XMM10b,
 383                      XMM11, XMM11b,
 384                      XMM12, XMM12b,
 385                      XMM13, XMM13b,
 386                      XMM14, XMM14b,
 387                      XMM15, XMM15b
 388 #endif
 389                      );
 390 
 391 // Class for all 32bit vector registers
 392 reg_class vectors_reg(XMM0,
 393                       XMM1,
 394                       XMM2,
 395                       XMM3,
 396                       XMM4,
 397                       XMM5,
 398                       XMM6,
 399                       XMM7
 400 #ifdef _LP64
 401                      ,XMM8,
 402                       XMM9,
 403                       XMM10,
 404                       XMM11,
 405                       XMM12,
 406                       XMM13,
 407                       XMM14,
 408                       XMM15
 409 #endif
 410                       );
 411 
 412 // Class for all 64bit vector registers
 413 reg_class vectord_reg(XMM0,  XMM0b,
 414                       XMM1,  XMM1b,
 415                       XMM2,  XMM2b,
 416                       XMM3,  XMM3b,
 417                       XMM4,  XMM4b,
 418                       XMM5,  XMM5b,
 419                       XMM6,  XMM6b,
 420                       XMM7,  XMM7b
 421 #ifdef _LP64
 422                      ,XMM8,  XMM8b,
 423                       XMM9,  XMM9b,
 424                       XMM10, XMM10b,
 425                       XMM11, XMM11b,
 426                       XMM12, XMM12b,
 427                       XMM13, XMM13b,
 428                       XMM14, XMM14b,
 429                       XMM15, XMM15b
 430 #endif
 431                       );
 432 
 433 // Class for all 128bit vector registers
 434 reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
 435                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 436                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 437                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 438                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 439                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 440                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 441                       XMM7,  XMM7b,  XMM7c,  XMM7d
 442 #ifdef _LP64
 443                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 444                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 445                       XMM10, XMM10b, XMM10c, XMM10d,
 446                       XMM11, XMM11b, XMM11c, XMM11d,
 447                       XMM12, XMM12b, XMM12c, XMM12d,
 448                       XMM13, XMM13b, XMM13c, XMM13d,
 449                       XMM14, XMM14b, XMM14c, XMM14d,
 450                       XMM15, XMM15b, XMM15c, XMM15d
 451 #endif
 452                       );
 453 
 454 // Class for all 256bit vector registers
 455 reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 456                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 457                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 458                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 459                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 460                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 461                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 462                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 463 #ifdef _LP64
 464                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 465                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 466                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 467                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 468                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 469                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 470                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 471                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 472 #endif
 473                       );
 474 
 475 %}
 476 
 477 
 478 //----------SOURCE BLOCK-------------------------------------------------------
 479 // This is a block of C++ code which provides values, functions, and
 480 // definitions necessary in the rest of the architecture description
 481 
 482 source_hpp %{
 483 // Header information of the source block.
 484 // Method declarations/definitions which are used outside
 485 // the ad-scope can conveniently be defined here.
 486 //
 487 // To keep related declarations/definitions/uses close together,
 488 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 489 
 490 class NativeJump;
 491 
 492 class CallStubImpl {
 493  
 494   //--------------------------------------------------------------
 495   //---<  Used for optimization in Compile::shorten_branches  >---
 496   //--------------------------------------------------------------
 497 
 498  public:
 499   // Size of call trampoline stub.
 500   static uint size_call_trampoline() {
 501     return 0; // no call trampolines on this platform
 502   }
 503   
 504   // number of relocations needed by a call trampoline stub
 505   static uint reloc_call_trampoline() { 
 506     return 0; // no call trampolines on this platform
 507   }
 508 };
 509 
 510 class HandlerImpl {
 511 
 512  public:
 513 
 514   static int emit_exception_handler(CodeBuffer &cbuf);
 515   static int emit_deopt_handler(CodeBuffer& cbuf);
 516 
 517   static uint size_exception_handler() {
 518     // NativeCall instruction size is the same as NativeJump.
 519     // exception handler starts out as jump and can be patched to
    // a call by deoptimization.  (4932387)
 521     // Note that this value is also credited (in output.cpp) to
 522     // the size of the code section.
 523     return NativeJump::instruction_size;
 524   }
 525 
 526 #ifdef _LP64
 527   static uint size_deopt_handler() {
 528     // three 5 byte instructions
 529     return 15;
 530   }
 531 #else
 532   static uint size_deopt_handler() {
 533     // NativeCall instruction size is the same as NativeJump.
 534     // exception handler starts out as jump and can be patched to
    // a call by deoptimization.  (4932387)
 536     // Note that this value is also credited (in output.cpp) to
 537     // the size of the code section.
 538     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 539   }
 540 #endif
 541 };
 542 
 543 %} // end source_hpp
 544 
 545 source %{
 546 
 547 // Emit exception handler code.
 548 // Stuff framesize into a register and call a VM stub routine.
 549 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 550 
 551   // Note that the code buffer's insts_mark is always relative to insts.
 552   // That's why we must use the macroassembler to generate a handler.
 553   MacroAssembler _masm(&cbuf);
 554   address base = __ start_a_stub(size_exception_handler());
 555   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 556   int offset = __ offset();
 557   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 558   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 559   __ end_a_stub();
 560   return offset;
 561 }
 562 
 563 // Emit deopt handler code.
 564 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 565 
 566   // Note that the code buffer's insts_mark is always relative to insts.
 567   // That's why we must use the macroassembler to generate a handler.
 568   MacroAssembler _masm(&cbuf);
 569   address base = __ start_a_stub(size_deopt_handler());
 570   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 571   int offset = __ offset();
 572 
 573 #ifdef _LP64
 574   address the_pc = (address) __ pc();
 575   Label next;
 576   // push a "the_pc" on the stack without destroying any registers
 577   // as they all may be live.
 578 
 579   // push address of "next"
 580   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 581   __ bind(next);
 582   // adjust it so it matches "the_pc"
 583   __ subptr(Address(rsp, 0), __ offset() - offset);
 584 #else
 585   InternalAddress here(__ pc());
 586   __ pushptr(here.addr());
 587 #endif
 588 
 589   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 590   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
 591   __ end_a_stub();
 592   return offset;
 593 }
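
// On LP64 the handler above is three 5-byte instructions -- call rel32 (which
// pushes the return address), sub qword ptr [rsp], imm8 (which rewrites that
// address back to "the_pc"), and jmp rel32 to the deopt blob -- which is what
// size_deopt_handler() accounts for, assuming the adjustment fits in an imm8
// (it is 5 here, the length of the call).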
 594 
 595 
 596 //=============================================================================
 597 
 598   // Float masks come from different places depending on platform.
 599 #ifdef _LP64
 600   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 601   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 602   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 603   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 604 #else
 605   static address float_signmask()  { return (address)float_signmask_pool; }
 606   static address float_signflip()  { return (address)float_signflip_pool; }
 607   static address double_signmask() { return (address)double_signmask_pool; }
 608   static address double_signflip() { return (address)double_signflip_pool; }
 609 #endif
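
// These return the addresses of bit-mask constants: a sign mask of the form
// 0x7FFFFFFF... (used to clear the sign bit for abs-style operations) and a
// sign flip of the form 0x80000000... (used to toggle the sign bit for
// negate-style operations).  The storage itself is owned by the stub routines
// or constant pools named above, not by this file.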
 610 
 611 
 612 const bool Matcher::match_rule_supported(int opcode) {
 613   if (!has_match_rule(opcode))
 614     return false;
 615 
 616   switch (opcode) {
 617     case Op_PopCountI:
 618     case Op_PopCountL:
 619       if (!UsePopCountInstruction)
 620         return false;
 621     break;
 622     case Op_MulVI:
 623       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
 624         return false;
 625     break;
    case Op_AddReductionVL:
      if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
        return false;
    break;
    case Op_AddReductionVI:
      if (UseSSE < 3) // requires at least SSE3
        return false;
    break;
    case Op_MulReductionVI:
      if (UseSSE < 4) // requires at least SSE4
        return false;
    break;
 635     case Op_AddReductionVF:
 636     case Op_AddReductionVD:
 637     case Op_MulReductionVF:
 638     case Op_MulReductionVD:
 639       if (UseSSE < 1) // requires at least SSE
 640         return false;
 641     break;
 642     case Op_CompareAndSwapL:
 643 #ifdef _LP64
 644     case Op_CompareAndSwapP:
 645 #endif
 646       if (!VM_Version::supports_cx8())
 647         return false;
 648     break;
 649   }
 650 
  return true;  // By default, match rules are supported.
 652 }
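
// For example, on a CPU with neither SSE4.1 nor AVX the check above rejects
// Op_MulVI, so C2 will not form vectorized int multiplies on that hardware;
// the same pattern applies to the other opcodes listed.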
 653 
 654 // Max vector size in bytes. 0 if not supported.
 655 const int Matcher::vector_width_in_bytes(BasicType bt) {
 656   assert(is_java_primitive(bt), "only primitive type vectors");
 657   if (UseSSE < 2) return 0;
 658   // SSE2 supports 128bit vectors for all types.
 659   // AVX2 supports 256bit vectors for all types.
 660   int size = (UseAVX > 1) ? 32 : 16;
 661   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 662   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 663     size = 32;
 664   // Use flag to limit vector size.
 665   size = MIN2(size,(int)MaxVectorSize);
 666   // Minimum 2 values in vector (or 4 for bytes).
 667   switch (bt) {
 668   case T_DOUBLE:
 669   case T_LONG:
 670     if (size < 16) return 0;
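    // fall through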
 671   case T_FLOAT:
 672   case T_INT:
 673     if (size < 8) return 0;
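    // fall through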
 674   case T_BOOLEAN:
 675   case T_BYTE:
 676   case T_CHAR:
 677   case T_SHORT:
 678     if (size < 4) return 0;
 679     break;
 680   default:
 681     ShouldNotReachHere();
 682   }
 683   return size;
 684 }
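
// A worked example (illustrative): with UseAVX >= 2 and MaxVectorSize >= 32,
// vector_width_in_bytes(T_INT) returns 32; with plain SSE2 (and MaxVectorSize
// not limiting it further) it returns 16.  Combined with max_vector_size()
// below that means 8 vs. 4 ints per vector.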
 685 
 686 // Limits on vector size (number of elements) loaded into vector.
 687 const int Matcher::max_vector_size(const BasicType bt) {
 688   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 689 }
 690 const int Matcher::min_vector_size(const BasicType bt) {
 691   int max_size = max_vector_size(bt);
 692   // Min size which can be loaded into vector is 4 bytes.
 693   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 694   return MIN2(size,max_size);
 695 }
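
// E.g. min_vector_size(T_BYTE) == 4 and min_vector_size(T_INT) == 2, capped
// by max_vector_size() when the hardware supports less.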
 696 
// Vector ideal reg corresponding to specified size in bytes
 698 const int Matcher::vector_ideal_reg(int size) {
 699   assert(MaxVectorSize >= size, "");
 700   switch(size) {
 701     case  4: return Op_VecS;
 702     case  8: return Op_VecD;
 703     case 16: return Op_VecX;
 704     case 32: return Op_VecY;
 705   }
 706   ShouldNotReachHere();
 707   return 0;
 708 }
 709 
 710 // Only lowest bits of xmm reg are used for vector shift count.
 711 const int Matcher::vector_shift_count_ideal_reg(int size) {
 712   return Op_VecS;
 713 }
 714 
// x86 supports misaligned vector store/load.
 716 const bool Matcher::misaligned_vectors_ok() {
 717   return !AlignVector; // can be changed by flag
 718 }
 719 
 720 // x86 AES instructions are compatible with SunJCE expanded
 721 // keys, hence we do not need to pass the original key to stubs
 722 const bool Matcher::pass_original_key_for_aes() {
 723   return false;
 724 }
 725 
 726 // Helper methods for MachSpillCopyNode::implementation().
 727 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
 728                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
 731   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 732   assert(ireg == Op_VecS || // 32bit vector
 733          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 734          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 735          "no non-adjacent vector moves" );
 736   if (cbuf) {
 737     MacroAssembler _masm(cbuf);
 738     int offset = __ offset();
 739     switch (ireg) {
 740     case Op_VecS: // copy whole register
 741     case Op_VecD:
 742     case Op_VecX:
 743       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 744       break;
 745     case Op_VecY:
 746       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 747       break;
 748     default:
 749       ShouldNotReachHere();
 750     }
 751     int size = __ offset() - offset;
 752 #ifdef ASSERT
 753     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
 755 #endif
 756     return size;
 757 #ifndef PRODUCT
 758   } else if (!do_size) {
 759     switch (ireg) {
 760     case Op_VecS:
 761     case Op_VecD:
 762     case Op_VecX:
 763       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 764       break;
 765     case Op_VecY:
 766       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 767       break;
 768     default:
 769       ShouldNotReachHere();
 770     }
 771 #endif
 772   }
 773   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
 774   return 4;
 775 }
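
// Note: movdqu xmm,xmm (F3 0F 6F /r) and its 2-byte-VEX form vmovdqu are both
// 4 bytes long, which is why the 32-bit size path can simply return 4.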
 776 
 777 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
 778                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
 781   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 782   if (cbuf) {
 783     MacroAssembler _masm(cbuf);
 784     int offset = __ offset();
 785     if (is_load) {
 786       switch (ireg) {
 787       case Op_VecS:
 788         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 789         break;
 790       case Op_VecD:
 791         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 792         break;
 793       case Op_VecX:
 794         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 795         break;
 796       case Op_VecY:
 797         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 798         break;
 799       default:
 800         ShouldNotReachHere();
 801       }
 802     } else { // store
 803       switch (ireg) {
 804       case Op_VecS:
 805         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 806         break;
 807       case Op_VecD:
 808         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 809         break;
 810       case Op_VecX:
 811         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 812         break;
 813       case Op_VecY:
 814         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 815         break;
 816       default:
 817         ShouldNotReachHere();
 818       }
 819     }
 820     int size = __ offset() - offset;
 821 #ifdef ASSERT
 822     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 823     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
 825 #endif
 826     return size;
 827 #ifndef PRODUCT
 828   } else if (!do_size) {
 829     if (is_load) {
 830       switch (ireg) {
 831       case Op_VecS:
 832         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 833         break;
 834       case Op_VecD:
 835         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 836         break;
 837        case Op_VecX:
 838         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 839         break;
 840       case Op_VecY:
 841         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 842         break;
 843       default:
 844         ShouldNotReachHere();
 845       }
 846     } else { // store
 847       switch (ireg) {
 848       case Op_VecS:
 849         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 850         break;
 851       case Op_VecD:
 852         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 853         break;
 854        case Op_VecX:
 855         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 856         break;
 857       case Op_VecY:
 858         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 859         break;
 860       default:
 861         ShouldNotReachHere();
 862       }
 863     }
 864 #endif
 865   }
 866   int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 867   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 868   return 5+offset_size;
 869 }
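
// The 5+offset_size estimate follows the x86 encoding: prefix, opcode bytes,
// ModRM and SIB account for 5 bytes, plus no displacement byte when the stack
// offset is 0, a disp8 when it fits in a byte, and a disp32 (4 bytes) otherwise.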
 870 
 871 static inline jfloat replicate4_imm(int con, int width) {
 872   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
 873   assert(width == 1 || width == 2, "only byte or short types here");
 874   int bit_width = width * 8;
 875   jint val = con;
 876   val &= (1 << bit_width) - 1;  // mask off sign bits
 877   while(bit_width < 32) {
 878     val |= (val << bit_width);
 879     bit_width <<= 1;
 880   }
 881   jfloat fval = *((jfloat*) &val);  // coerce to float type
 882   return fval;
 883 }
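
// Worked example (illustrative): replicate4_imm(0x1, 1) masks the constant to
// 0x01 and then doubles the pattern until it fills 32 bits:
//   0x01 -> 0x0101 -> 0x01010101
// The result is that 32-bit pattern reinterpreted as a jfloat, suitable for
// use as an in-memory constant.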
 884 
 885 static inline jdouble replicate8_imm(int con, int width) {
 886   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
 887   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
 888   int bit_width = width * 8;
 889   jlong val = con;
 890   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
 891   while(bit_width < 64) {
 892     val |= (val << bit_width);
 893     bit_width <<= 1;
 894   }
 895   jdouble dval = *((jdouble*) &val);  // coerce to double type
 896   return dval;
 897 }
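
// Similarly, replicate8_imm(0xFF, 1) yields the 64-bit pattern
// 0xFFFFFFFFFFFFFFFF viewed as a jdouble, and replicate8_imm(con, 4) simply
// pairs the 32-bit constant with itself.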
 898 
 899 #ifndef PRODUCT
 900   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 901     st->print("nop \t# %d bytes pad for loops and calls", _count);
 902   }
 903 #endif
 904 
 905   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 906     MacroAssembler _masm(&cbuf);
 907     __ nop(_count);
 908   }
 909 
 910   uint MachNopNode::size(PhaseRegAlloc*) const {
 911     return _count;
 912   }
 913 
 914 #ifndef PRODUCT
 915   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 916     st->print("# breakpoint");
 917   }
 918 #endif
 919 
 920   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 921     MacroAssembler _masm(&cbuf);
 922     __ int3();
 923   }
 924 
 925   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 926     return MachNode::size(ra_);
 927   }
 928 
 929 %}
 930 
 931 encode %{
 932 
 933   enc_class preserve_SP %{
 934     debug_only(int off0 = cbuf.insts_size());
 935     MacroAssembler _masm(&cbuf);
 936     // RBP is preserved across all calls, even compiled calls.
 937     // Use it to preserve RSP in places where the callee might change the SP.
 938     __ movptr(rbp_mh_SP_save, rsp);
 939     debug_only(int off1 = cbuf.insts_size());
 940     assert(off1 - off0 == preserve_SP_size(), "correct size prediction");
 941   %}
 942 
 943   enc_class restore_SP %{
 944     MacroAssembler _masm(&cbuf);
 945     __ movptr(rsp, rbp_mh_SP_save);
 946   %}
 947 
 948   enc_class call_epilog %{
 949     if (VerifyStackAtCalls) {
 950       // Check that stack depth is unchanged: find majik cookie on stack
 951       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 952       MacroAssembler _masm(&cbuf);
 953       Label L;
 954       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 955       __ jccb(Assembler::equal, L);
 956       // Die if stack mismatch
 957       __ int3();
 958       __ bind(L);
 959     }
 960   %}
 961 
 962 %}
 963 
 964 
 965 //----------OPERANDS-----------------------------------------------------------
 966 // Operand definitions must precede instruction definitions for correct parsing
 967 // in the ADLC because operands constitute user defined types which are used in
 968 // instruction definitions.
 969 
 970 // Vectors
 971 operand vecS() %{
 972   constraint(ALLOC_IN_RC(vectors_reg));
 973   match(VecS);
 974 
 975   format %{ %}
 976   interface(REG_INTER);
 977 %}
 978 
 979 operand vecD() %{
 980   constraint(ALLOC_IN_RC(vectord_reg));
 981   match(VecD);
 982 
 983   format %{ %}
 984   interface(REG_INTER);
 985 %}
 986 
 987 operand vecX() %{
 988   constraint(ALLOC_IN_RC(vectorx_reg));
 989   match(VecX);
 990 
 991   format %{ %}
 992   interface(REG_INTER);
 993 %}
 994 
 995 operand vecY() %{
 996   constraint(ALLOC_IN_RC(vectory_reg));
 997   match(VecY);
 998 
 999   format %{ %}
1000   interface(REG_INTER);
1001 %}
1002 
1003 
1004 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1005 
1006 // ============================================================================
1007 
1008 instruct ShouldNotReachHere() %{
1009   match(Halt);
1010   format %{ "int3\t# ShouldNotReachHere" %}
1011   ins_encode %{
1012     __ int3();
1013   %}
1014   ins_pipe(pipe_slow);
1015 %}
1016 
1017 // ============================================================================
1018 
1019 instruct addF_reg(regF dst, regF src) %{
1020   predicate((UseSSE>=1) && (UseAVX == 0));
1021   match(Set dst (AddF dst src));
1022 
1023   format %{ "addss   $dst, $src" %}
1024   ins_cost(150);
1025   ins_encode %{
1026     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1027   %}
1028   ins_pipe(pipe_slow);
1029 %}
1030 
1031 instruct addF_mem(regF dst, memory src) %{
1032   predicate((UseSSE>=1) && (UseAVX == 0));
1033   match(Set dst (AddF dst (LoadF src)));
1034 
1035   format %{ "addss   $dst, $src" %}
1036   ins_cost(150);
1037   ins_encode %{
1038     __ addss($dst$$XMMRegister, $src$$Address);
1039   %}
1040   ins_pipe(pipe_slow);
1041 %}
1042 
1043 instruct addF_imm(regF dst, immF con) %{
1044   predicate((UseSSE>=1) && (UseAVX == 0));
1045   match(Set dst (AddF dst con));
1046   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1047   ins_cost(150);
1048   ins_encode %{
1049     __ addss($dst$$XMMRegister, $constantaddress($con));
1050   %}
1051   ins_pipe(pipe_slow);
1052 %}
1053 
1054 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1055   predicate(UseAVX > 0);
1056   match(Set dst (AddF src1 src2));
1057 
1058   format %{ "vaddss  $dst, $src1, $src2" %}
1059   ins_cost(150);
1060   ins_encode %{
1061     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1062   %}
1063   ins_pipe(pipe_slow);
1064 %}
1065 
1066 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1067   predicate(UseAVX > 0);
1068   match(Set dst (AddF src1 (LoadF src2)));
1069 
1070   format %{ "vaddss  $dst, $src1, $src2" %}
1071   ins_cost(150);
1072   ins_encode %{
1073     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1074   %}
1075   ins_pipe(pipe_slow);
1076 %}
1077 
1078 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1079   predicate(UseAVX > 0);
1080   match(Set dst (AddF src con));
1081 
1082   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1083   ins_cost(150);
1084   ins_encode %{
1085     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1086   %}
1087   ins_pipe(pipe_slow);
1088 %}
1089 
1090 instruct addD_reg(regD dst, regD src) %{
1091   predicate((UseSSE>=2) && (UseAVX == 0));
1092   match(Set dst (AddD dst src));
1093 
1094   format %{ "addsd   $dst, $src" %}
1095   ins_cost(150);
1096   ins_encode %{
1097     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1098   %}
1099   ins_pipe(pipe_slow);
1100 %}
1101 
1102 instruct addD_mem(regD dst, memory src) %{
1103   predicate((UseSSE>=2) && (UseAVX == 0));
1104   match(Set dst (AddD dst (LoadD src)));
1105 
1106   format %{ "addsd   $dst, $src" %}
1107   ins_cost(150);
1108   ins_encode %{
1109     __ addsd($dst$$XMMRegister, $src$$Address);
1110   %}
1111   ins_pipe(pipe_slow);
1112 %}
1113 
1114 instruct addD_imm(regD dst, immD con) %{
1115   predicate((UseSSE>=2) && (UseAVX == 0));
1116   match(Set dst (AddD dst con));
1117   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1118   ins_cost(150);
1119   ins_encode %{
1120     __ addsd($dst$$XMMRegister, $constantaddress($con));
1121   %}
1122   ins_pipe(pipe_slow);
1123 %}
1124 
1125 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1126   predicate(UseAVX > 0);
1127   match(Set dst (AddD src1 src2));
1128 
1129   format %{ "vaddsd  $dst, $src1, $src2" %}
1130   ins_cost(150);
1131   ins_encode %{
1132     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1133   %}
1134   ins_pipe(pipe_slow);
1135 %}
1136 
1137 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1138   predicate(UseAVX > 0);
1139   match(Set dst (AddD src1 (LoadD src2)));
1140 
1141   format %{ "vaddsd  $dst, $src1, $src2" %}
1142   ins_cost(150);
1143   ins_encode %{
1144     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1145   %}
1146   ins_pipe(pipe_slow);
1147 %}
1148 
1149 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1150   predicate(UseAVX > 0);
1151   match(Set dst (AddD src con));
1152 
1153   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1154   ins_cost(150);
1155   ins_encode %{
1156     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1157   %}
1158   ins_pipe(pipe_slow);
1159 %}
1160 
1161 instruct subF_reg(regF dst, regF src) %{
1162   predicate((UseSSE>=1) && (UseAVX == 0));
1163   match(Set dst (SubF dst src));
1164 
1165   format %{ "subss   $dst, $src" %}
1166   ins_cost(150);
1167   ins_encode %{
1168     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1169   %}
1170   ins_pipe(pipe_slow);
1171 %}
1172 
1173 instruct subF_mem(regF dst, memory src) %{
1174   predicate((UseSSE>=1) && (UseAVX == 0));
1175   match(Set dst (SubF dst (LoadF src)));
1176 
1177   format %{ "subss   $dst, $src" %}
1178   ins_cost(150);
1179   ins_encode %{
1180     __ subss($dst$$XMMRegister, $src$$Address);
1181   %}
1182   ins_pipe(pipe_slow);
1183 %}
1184 
1185 instruct subF_imm(regF dst, immF con) %{
1186   predicate((UseSSE>=1) && (UseAVX == 0));
1187   match(Set dst (SubF dst con));
1188   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1189   ins_cost(150);
1190   ins_encode %{
1191     __ subss($dst$$XMMRegister, $constantaddress($con));
1192   %}
1193   ins_pipe(pipe_slow);
1194 %}
1195 
1196 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
1197   predicate(UseAVX > 0);
1198   match(Set dst (SubF src1 src2));
1199 
1200   format %{ "vsubss  $dst, $src1, $src2" %}
1201   ins_cost(150);
1202   ins_encode %{
1203     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1204   %}
1205   ins_pipe(pipe_slow);
1206 %}
1207 
1208 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
1209   predicate(UseAVX > 0);
1210   match(Set dst (SubF src1 (LoadF src2)));
1211 
1212   format %{ "vsubss  $dst, $src1, $src2" %}
1213   ins_cost(150);
1214   ins_encode %{
1215     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1216   %}
1217   ins_pipe(pipe_slow);
1218 %}
1219 
1220 instruct subF_reg_imm(regF dst, regF src, immF con) %{
1221   predicate(UseAVX > 0);
1222   match(Set dst (SubF src con));
1223 
1224   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1225   ins_cost(150);
1226   ins_encode %{
1227     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1228   %}
1229   ins_pipe(pipe_slow);
1230 %}
1231 
1232 instruct subD_reg(regD dst, regD src) %{
1233   predicate((UseSSE>=2) && (UseAVX == 0));
1234   match(Set dst (SubD dst src));
1235 
1236   format %{ "subsd   $dst, $src" %}
1237   ins_cost(150);
1238   ins_encode %{
1239     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
1240   %}
1241   ins_pipe(pipe_slow);
1242 %}
1243 
1244 instruct subD_mem(regD dst, memory src) %{
1245   predicate((UseSSE>=2) && (UseAVX == 0));
1246   match(Set dst (SubD dst (LoadD src)));
1247 
1248   format %{ "subsd   $dst, $src" %}
1249   ins_cost(150);
1250   ins_encode %{
1251     __ subsd($dst$$XMMRegister, $src$$Address);
1252   %}
1253   ins_pipe(pipe_slow);
1254 %}
1255 
1256 instruct subD_imm(regD dst, immD con) %{
1257   predicate((UseSSE>=2) && (UseAVX == 0));
1258   match(Set dst (SubD dst con));
1259   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1260   ins_cost(150);
1261   ins_encode %{
1262     __ subsd($dst$$XMMRegister, $constantaddress($con));
1263   %}
1264   ins_pipe(pipe_slow);
1265 %}
1266 
1267 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
1268   predicate(UseAVX > 0);
1269   match(Set dst (SubD src1 src2));
1270 
1271   format %{ "vsubsd  $dst, $src1, $src2" %}
1272   ins_cost(150);
1273   ins_encode %{
1274     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1275   %}
1276   ins_pipe(pipe_slow);
1277 %}
1278 
1279 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
1280   predicate(UseAVX > 0);
1281   match(Set dst (SubD src1 (LoadD src2)));
1282 
1283   format %{ "vsubsd  $dst, $src1, $src2" %}
1284   ins_cost(150);
1285   ins_encode %{
1286     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1287   %}
1288   ins_pipe(pipe_slow);
1289 %}
1290 
1291 instruct subD_reg_imm(regD dst, regD src, immD con) %{
1292   predicate(UseAVX > 0);
1293   match(Set dst (SubD src con));
1294 
1295   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1296   ins_cost(150);
1297   ins_encode %{
1298     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1299   %}
1300   ins_pipe(pipe_slow);
1301 %}
1302 
1303 instruct mulF_reg(regF dst, regF src) %{
1304   predicate((UseSSE>=1) && (UseAVX == 0));
1305   match(Set dst (MulF dst src));
1306 
1307   format %{ "mulss   $dst, $src" %}
1308   ins_cost(150);
1309   ins_encode %{
1310     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
1311   %}
1312   ins_pipe(pipe_slow);
1313 %}
1314 
1315 instruct mulF_mem(regF dst, memory src) %{
1316   predicate((UseSSE>=1) && (UseAVX == 0));
1317   match(Set dst (MulF dst (LoadF src)));
1318 
1319   format %{ "mulss   $dst, $src" %}
1320   ins_cost(150);
1321   ins_encode %{
1322     __ mulss($dst$$XMMRegister, $src$$Address);
1323   %}
1324   ins_pipe(pipe_slow);
1325 %}
1326 
1327 instruct mulF_imm(regF dst, immF con) %{
1328   predicate((UseSSE>=1) && (UseAVX == 0));
1329   match(Set dst (MulF dst con));
1330   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1331   ins_cost(150);
1332   ins_encode %{
1333     __ mulss($dst$$XMMRegister, $constantaddress($con));
1334   %}
1335   ins_pipe(pipe_slow);
1336 %}
1337 
1338 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
1339   predicate(UseAVX > 0);
1340   match(Set dst (MulF src1 src2));
1341 
1342   format %{ "vmulss  $dst, $src1, $src2" %}
1343   ins_cost(150);
1344   ins_encode %{
1345     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1346   %}
1347   ins_pipe(pipe_slow);
1348 %}
1349 
1350 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
1351   predicate(UseAVX > 0);
1352   match(Set dst (MulF src1 (LoadF src2)));
1353 
1354   format %{ "vmulss  $dst, $src1, $src2" %}
1355   ins_cost(150);
1356   ins_encode %{
1357     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1358   %}
1359   ins_pipe(pipe_slow);
1360 %}
1361 
1362 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
1363   predicate(UseAVX > 0);
1364   match(Set dst (MulF src con));
1365 
1366   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1367   ins_cost(150);
1368   ins_encode %{
1369     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1370   %}
1371   ins_pipe(pipe_slow);
1372 %}
1373 
1374 instruct mulD_reg(regD dst, regD src) %{
1375   predicate((UseSSE>=2) && (UseAVX == 0));
1376   match(Set dst (MulD dst src));
1377 
1378   format %{ "mulsd   $dst, $src" %}
1379   ins_cost(150);
1380   ins_encode %{
1381     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
1382   %}
1383   ins_pipe(pipe_slow);
1384 %}
1385 
1386 instruct mulD_mem(regD dst, memory src) %{
1387   predicate((UseSSE>=2) && (UseAVX == 0));
1388   match(Set dst (MulD dst (LoadD src)));
1389 
1390   format %{ "mulsd   $dst, $src" %}
1391   ins_cost(150);
1392   ins_encode %{
1393     __ mulsd($dst$$XMMRegister, $src$$Address);
1394   %}
1395   ins_pipe(pipe_slow);
1396 %}
1397 
1398 instruct mulD_imm(regD dst, immD con) %{
1399   predicate((UseSSE>=2) && (UseAVX == 0));
1400   match(Set dst (MulD dst con));
1401   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1402   ins_cost(150);
1403   ins_encode %{
1404     __ mulsd($dst$$XMMRegister, $constantaddress($con));
1405   %}
1406   ins_pipe(pipe_slow);
1407 %}
1408 
1409 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
1410   predicate(UseAVX > 0);
1411   match(Set dst (MulD src1 src2));
1412 
1413   format %{ "vmulsd  $dst, $src1, $src2" %}
1414   ins_cost(150);
1415   ins_encode %{
1416     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1417   %}
1418   ins_pipe(pipe_slow);
1419 %}
1420 
1421 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
1422   predicate(UseAVX > 0);
1423   match(Set dst (MulD src1 (LoadD src2)));
1424 
1425   format %{ "vmulsd  $dst, $src1, $src2" %}
1426   ins_cost(150);
1427   ins_encode %{
1428     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1429   %}
1430   ins_pipe(pipe_slow);
1431 %}
1432 
1433 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
1434   predicate(UseAVX > 0);
1435   match(Set dst (MulD src con));
1436 
1437   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1438   ins_cost(150);
1439   ins_encode %{
1440     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1441   %}
1442   ins_pipe(pipe_slow);
1443 %}
1444 
1445 instruct divF_reg(regF dst, regF src) %{
1446   predicate((UseSSE>=1) && (UseAVX == 0));
1447   match(Set dst (DivF dst src));
1448 
1449   format %{ "divss   $dst, $src" %}
1450   ins_cost(150);
1451   ins_encode %{
1452     __ divss($dst$$XMMRegister, $src$$XMMRegister);
1453   %}
1454   ins_pipe(pipe_slow);
1455 %}
1456 
1457 instruct divF_mem(regF dst, memory src) %{
1458   predicate((UseSSE>=1) && (UseAVX == 0));
1459   match(Set dst (DivF dst (LoadF src)));
1460 
1461   format %{ "divss   $dst, $src" %}
1462   ins_cost(150);
1463   ins_encode %{
1464     __ divss($dst$$XMMRegister, $src$$Address);
1465   %}
1466   ins_pipe(pipe_slow);
1467 %}
1468 
1469 instruct divF_imm(regF dst, immF con) %{
1470   predicate((UseSSE>=1) && (UseAVX == 0));
1471   match(Set dst (DivF dst con));
1472   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1473   ins_cost(150);
1474   ins_encode %{
1475     __ divss($dst$$XMMRegister, $constantaddress($con));
1476   %}
1477   ins_pipe(pipe_slow);
1478 %}
1479 
1480 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
1481   predicate(UseAVX > 0);
1482   match(Set dst (DivF src1 src2));
1483 
1484   format %{ "vdivss  $dst, $src1, $src2" %}
1485   ins_cost(150);
1486   ins_encode %{
1487     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1488   %}
1489   ins_pipe(pipe_slow);
1490 %}
1491 
1492 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
1493   predicate(UseAVX > 0);
1494   match(Set dst (DivF src1 (LoadF src2)));
1495 
1496   format %{ "vdivss  $dst, $src1, $src2" %}
1497   ins_cost(150);
1498   ins_encode %{
1499     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1500   %}
1501   ins_pipe(pipe_slow);
1502 %}
1503 
1504 instruct divF_reg_imm(regF dst, regF src, immF con) %{
1505   predicate(UseAVX > 0);
1506   match(Set dst (DivF src con));
1507 
1508   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1509   ins_cost(150);
1510   ins_encode %{
1511     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1512   %}
1513   ins_pipe(pipe_slow);
1514 %}
1515 
1516 instruct divD_reg(regD dst, regD src) %{
1517   predicate((UseSSE>=2) && (UseAVX == 0));
1518   match(Set dst (DivD dst src));
1519 
1520   format %{ "divsd   $dst, $src" %}
1521   ins_cost(150);
1522   ins_encode %{
1523     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
1524   %}
1525   ins_pipe(pipe_slow);
1526 %}
1527 
1528 instruct divD_mem(regD dst, memory src) %{
1529   predicate((UseSSE>=2) && (UseAVX == 0));
1530   match(Set dst (DivD dst (LoadD src)));
1531 
1532   format %{ "divsd   $dst, $src" %}
1533   ins_cost(150);
1534   ins_encode %{
1535     __ divsd($dst$$XMMRegister, $src$$Address);
1536   %}
1537   ins_pipe(pipe_slow);
1538 %}
1539 
1540 instruct divD_imm(regD dst, immD con) %{
1541   predicate((UseSSE>=2) && (UseAVX == 0));
1542   match(Set dst (DivD dst con));
1543   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1544   ins_cost(150);
1545   ins_encode %{
1546     __ divsd($dst$$XMMRegister, $constantaddress($con));
1547   %}
1548   ins_pipe(pipe_slow);
1549 %}
1550 
1551 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
1552   predicate(UseAVX > 0);
1553   match(Set dst (DivD src1 src2));
1554 
1555   format %{ "vdivsd  $dst, $src1, $src2" %}
1556   ins_cost(150);
1557   ins_encode %{
1558     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1559   %}
1560   ins_pipe(pipe_slow);
1561 %}
1562 
1563 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
1564   predicate(UseAVX > 0);
1565   match(Set dst (DivD src1 (LoadD src2)));
1566 
1567   format %{ "vdivsd  $dst, $src1, $src2" %}
1568   ins_cost(150);
1569   ins_encode %{
1570     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1571   %}
1572   ins_pipe(pipe_slow);
1573 %}
1574 
1575 instruct divD_reg_imm(regD dst, regD src, immD con) %{
1576   predicate(UseAVX > 0);
1577   match(Set dst (DivD src con));
1578 
1579   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1580   ins_cost(150);
1581   ins_encode %{
1582     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1583   %}
1584   ins_pipe(pipe_slow);
1585 %}
1586 
1587 instruct absF_reg(regF dst) %{
1588   predicate((UseSSE>=1) && (UseAVX == 0));
1589   match(Set dst (AbsF dst));
1590   ins_cost(150);
1591   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
1592   ins_encode %{
1593     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
1594   %}
1595   ins_pipe(pipe_slow);
1596 %}
1597 
1598 instruct absF_reg_reg(regF dst, regF src) %{
1599   predicate(UseAVX > 0);
1600   match(Set dst (AbsF src));
1601   ins_cost(150);
1602   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
1603   ins_encode %{
1604     bool vector256 = false;
1605     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
1606               ExternalAddress(float_signmask()), vector256);
1607   %}
1608   ins_pipe(pipe_slow);
1609 %}
1610 
1611 instruct absD_reg(regD dst) %{
1612   predicate((UseSSE>=2) && (UseAVX == 0));
1613   match(Set dst (AbsD dst));
1614   ins_cost(150);
1615   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
1616             "# abs double by sign masking" %}
1617   ins_encode %{
1618     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
1619   %}
1620   ins_pipe(pipe_slow);
1621 %}
1622 
1623 instruct absD_reg_reg(regD dst, regD src) %{
1624   predicate(UseAVX > 0);
1625   match(Set dst (AbsD src));
1626   ins_cost(150);
1627   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
1628             "# abs double by sign masking" %}
1629   ins_encode %{
1630     bool vector256 = false;
1631     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
1632               ExternalAddress(double_signmask()), vector256);
1633   %}
1634   ins_pipe(pipe_slow);
1635 %}
1636 
1637 instruct negF_reg(regF dst) %{
1638   predicate((UseSSE>=1) && (UseAVX == 0));
1639   match(Set dst (NegF dst));
1640   ins_cost(150);
1641   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
1642   ins_encode %{
1643     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
1644   %}
1645   ins_pipe(pipe_slow);
1646 %}
1647 
1648 instruct negF_reg_reg(regF dst, regF src) %{
1649   predicate(UseAVX > 0);
1650   match(Set dst (NegF src));
1651   ins_cost(150);
1652   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
1653   ins_encode %{
1654     bool vector256 = false;
1655     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
1656               ExternalAddress(float_signflip()), vector256);
1657   %}
1658   ins_pipe(pipe_slow);
1659 %}
1660 
1661 instruct negD_reg(regD dst) %{
1662   predicate((UseSSE>=2) && (UseAVX == 0));
1663   match(Set dst (NegD dst));
1664   ins_cost(150);
1665   format %{ "xorpd   $dst, [0x8000000000000000]\t"
1666             "# neg double by sign flipping" %}
1667   ins_encode %{
1668     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
1669   %}
1670   ins_pipe(pipe_slow);
1671 %}
1672 
1673 instruct negD_reg_reg(regD dst, regD src) %{
1674   predicate(UseAVX > 0);
1675   match(Set dst (NegD src));
1676   ins_cost(150);
1677   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
1678             "# neg double by sign flipping" %}
1679   ins_encode %{
1680     bool vector256 = false;
1681     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
1682               ExternalAddress(double_signflip()), vector256);
1683   %}
1684   ins_pipe(pipe_slow);
1685 %}
1686 
1687 instruct sqrtF_reg(regF dst, regF src) %{
1688   predicate(UseSSE>=1);
1689   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
1690 
1691   format %{ "sqrtss  $dst, $src" %}
1692   ins_cost(150);
1693   ins_encode %{
1694     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
1695   %}
1696   ins_pipe(pipe_slow);
1697 %}
1698 
1699 instruct sqrtF_mem(regF dst, memory src) %{
1700   predicate(UseSSE>=1);
1701   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
1702 
1703   format %{ "sqrtss  $dst, $src" %}
1704   ins_cost(150);
1705   ins_encode %{
1706     __ sqrtss($dst$$XMMRegister, $src$$Address);
1707   %}
1708   ins_pipe(pipe_slow);
1709 %}
1710 
1711 instruct sqrtF_imm(regF dst, immF con) %{
1712   predicate(UseSSE>=1);
1713   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
1714   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1715   ins_cost(150);
1716   ins_encode %{
1717     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
1718   %}
1719   ins_pipe(pipe_slow);
1720 %}
1721 
1722 instruct sqrtD_reg(regD dst, regD src) %{
1723   predicate(UseSSE>=2);
1724   match(Set dst (SqrtD src));
1725 
1726   format %{ "sqrtsd  $dst, $src" %}
1727   ins_cost(150);
1728   ins_encode %{
1729     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
1730   %}
1731   ins_pipe(pipe_slow);
1732 %}
1733 
1734 instruct sqrtD_mem(regD dst, memory src) %{
1735   predicate(UseSSE>=2);
1736   match(Set dst (SqrtD (LoadD src)));
1737 
1738   format %{ "sqrtsd  $dst, $src" %}
1739   ins_cost(150);
1740   ins_encode %{
1741     __ sqrtsd($dst$$XMMRegister, $src$$Address);
1742   %}
1743   ins_pipe(pipe_slow);
1744 %}
1745 
1746 instruct sqrtD_imm(regD dst, immD con) %{
1747   predicate(UseSSE>=2);
1748   match(Set dst (SqrtD con));
1749   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1750   ins_cost(150);
1751   ins_encode %{
1752     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
1753   %}
1754   ins_pipe(pipe_slow);
1755 %}
1756 
1757 
1758 // ====================VECTOR INSTRUCTIONS=====================================
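//
// The vector operand classes used below correspond to the vector width:
// vecS holds 4 bytes, vecD 8 bytes, vecX 16 bytes (an XMM register) and
// vecY 32 bytes (a YMM register), matching the memory_size() checks in the
// load/store predicates that follow.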
1759 
1760 // Load vectors (4 bytes long)
1761 instruct loadV4(vecS dst, memory mem) %{
1762   predicate(n->as_LoadVector()->memory_size() == 4);
1763   match(Set dst (LoadVector mem));
1764   ins_cost(125);
1765   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
1766   ins_encode %{
1767     __ movdl($dst$$XMMRegister, $mem$$Address);
1768   %}
1769   ins_pipe( pipe_slow );
1770 %}
1771 
1772 // Load vectors (8 bytes long)
1773 instruct loadV8(vecD dst, memory mem) %{
1774   predicate(n->as_LoadVector()->memory_size() == 8);
1775   match(Set dst (LoadVector mem));
1776   ins_cost(125);
1777   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
1778   ins_encode %{
1779     __ movq($dst$$XMMRegister, $mem$$Address);
1780   %}
1781   ins_pipe( pipe_slow );
1782 %}
1783 
1784 // Load vectors (16 bytes long)
1785 instruct loadV16(vecX dst, memory mem) %{
1786   predicate(n->as_LoadVector()->memory_size() == 16);
1787   match(Set dst (LoadVector mem));
1788   ins_cost(125);
1789   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
1790   ins_encode %{
1791     __ movdqu($dst$$XMMRegister, $mem$$Address);
1792   %}
1793   ins_pipe( pipe_slow );
1794 %}
1795 
1796 // Load vectors (32 bytes long)
1797 instruct loadV32(vecY dst, memory mem) %{
1798   predicate(n->as_LoadVector()->memory_size() == 32);
1799   match(Set dst (LoadVector mem));
1800   ins_cost(125);
1801   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
1802   ins_encode %{
1803     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
1804   %}
1805   ins_pipe( pipe_slow );
1806 %}
1807 
1808 // Store vectors
1809 instruct storeV4(memory mem, vecS src) %{
1810   predicate(n->as_StoreVector()->memory_size() == 4);
1811   match(Set mem (StoreVector mem src));
1812   ins_cost(145);
1813   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
1814   ins_encode %{
1815     __ movdl($mem$$Address, $src$$XMMRegister);
1816   %}
1817   ins_pipe( pipe_slow );
1818 %}
1819 
1820 instruct storeV8(memory mem, vecD src) %{
1821   predicate(n->as_StoreVector()->memory_size() == 8);
1822   match(Set mem (StoreVector mem src));
1823   ins_cost(145);
1824   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
1825   ins_encode %{
1826     __ movq($mem$$Address, $src$$XMMRegister);
1827   %}
1828   ins_pipe( pipe_slow );
1829 %}
1830 
1831 instruct storeV16(memory mem, vecX src) %{
1832   predicate(n->as_StoreVector()->memory_size() == 16);
1833   match(Set mem (StoreVector mem src));
1834   ins_cost(145);
1835   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
1836   ins_encode %{
1837     __ movdqu($mem$$Address, $src$$XMMRegister);
1838   %}
1839   ins_pipe( pipe_slow );
1840 %}
1841 
1842 instruct storeV32(memory mem, vecY src) %{
1843   predicate(n->as_StoreVector()->memory_size() == 32);
1844   match(Set mem (StoreVector mem src));
1845   ins_cost(145);
1846   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
1847   ins_encode %{
1848     __ vmovdqu($mem$$Address, $src$$XMMRegister);
1849   %}
1850   ins_pipe( pipe_slow );
1851 %}
1852 
1853 // Replicate byte scalar to be vector
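// The common idiom below: movdl moves the GPR into the low dword of the XMM
// register, punpcklbw duplicates the low byte into the low word, pshuflw
// broadcasts that word across the low 64 bits; punpcklqdq (16-byte case)
// copies the low quadword into the high one, and vinserti128h (32-byte case)
// copies the low 128 bits into the upper half of the YMM register.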
1854 instruct Repl4B(vecS dst, rRegI src) %{
1855   predicate(n->as_Vector()->length() == 4);
1856   match(Set dst (ReplicateB src));
1857   format %{ "movd    $dst,$src\n\t"
1858             "punpcklbw $dst,$dst\n\t"
1859             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
1860   ins_encode %{
1861     __ movdl($dst$$XMMRegister, $src$$Register);
1862     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1863     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1864   %}
1865   ins_pipe( pipe_slow );
1866 %}
1867 
1868 instruct Repl8B(vecD dst, rRegI src) %{
1869   predicate(n->as_Vector()->length() == 8);
1870   match(Set dst (ReplicateB src));
1871   format %{ "movd    $dst,$src\n\t"
1872             "punpcklbw $dst,$dst\n\t"
1873             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
1874   ins_encode %{
1875     __ movdl($dst$$XMMRegister, $src$$Register);
1876     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1877     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1878   %}
1879   ins_pipe( pipe_slow );
1880 %}
1881 
1882 instruct Repl16B(vecX dst, rRegI src) %{
1883   predicate(n->as_Vector()->length() == 16);
1884   match(Set dst (ReplicateB src));
1885   format %{ "movd    $dst,$src\n\t"
1886             "punpcklbw $dst,$dst\n\t"
1887             "pshuflw $dst,$dst,0x00\n\t"
1888             "punpcklqdq $dst,$dst\t! replicate16B" %}
1889   ins_encode %{
1890     __ movdl($dst$$XMMRegister, $src$$Register);
1891     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1892     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1893     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1894   %}
1895   ins_pipe( pipe_slow );
1896 %}
1897 
1898 instruct Repl32B(vecY dst, rRegI src) %{
1899   predicate(n->as_Vector()->length() == 32);
1900   match(Set dst (ReplicateB src));
1901   format %{ "movd    $dst,$src\n\t"
1902             "punpcklbw $dst,$dst\n\t"
1903             "pshuflw $dst,$dst,0x00\n\t"
1904             "punpcklqdq $dst,$dst\n\t"
1905             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
1906   ins_encode %{
1907     __ movdl($dst$$XMMRegister, $src$$Register);
1908     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1909     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1910     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1911     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1912   %}
1913   ins_pipe( pipe_slow );
1914 %}
1915 
1916 // Replicate byte scalar immediate to be vector by loading from const table.
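// replicate4_imm/replicate8_imm build a 32-/64-bit constant in which the
// immediate is already repeated at the given element width (the second
// argument), so a single movdl/movq from the constant table materializes
// the replicated pattern.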
1917 instruct Repl4B_imm(vecS dst, immI con) %{
1918   predicate(n->as_Vector()->length() == 4);
1919   match(Set dst (ReplicateB con));
1920   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
1921   ins_encode %{
1922     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
1923   %}
1924   ins_pipe( pipe_slow );
1925 %}
1926 
1927 instruct Repl8B_imm(vecD dst, immI con) %{
1928   predicate(n->as_Vector()->length() == 8);
1929   match(Set dst (ReplicateB con));
1930   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
1931   ins_encode %{
1932     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1933   %}
1934   ins_pipe( pipe_slow );
1935 %}
1936 
1937 instruct Repl16B_imm(vecX dst, immI con) %{
1938   predicate(n->as_Vector()->length() == 16);
1939   match(Set dst (ReplicateB con));
1940   format %{ "movq    $dst,[$constantaddress]\n\t"
1941             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
1942   ins_encode %{
1943     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1944     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1945   %}
1946   ins_pipe( pipe_slow );
1947 %}
1948 
1949 instruct Repl32B_imm(vecY dst, immI con) %{
1950   predicate(n->as_Vector()->length() == 32);
1951   match(Set dst (ReplicateB con));
1952   format %{ "movq    $dst,[$constantaddress]\n\t"
1953             "punpcklqdq $dst,$dst\n\t"
1954             "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
1955   ins_encode %{
1956     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1957     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1958     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1959   %}
1960   ins_pipe( pipe_slow );
1961 %}
1962 
1963 // Replicate byte scalar zero to be vector
1964 instruct Repl4B_zero(vecS dst, immI0 zero) %{
1965   predicate(n->as_Vector()->length() == 4);
1966   match(Set dst (ReplicateB zero));
1967   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
1968   ins_encode %{
1969     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1970   %}
1971   ins_pipe( fpu_reg_reg );
1972 %}
1973 
1974 instruct Repl8B_zero(vecD dst, immI0 zero) %{
1975   predicate(n->as_Vector()->length() == 8);
1976   match(Set dst (ReplicateB zero));
1977   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
1978   ins_encode %{
1979     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1980   %}
1981   ins_pipe( fpu_reg_reg );
1982 %}
1983 
1984 instruct Repl16B_zero(vecX dst, immI0 zero) %{
1985   predicate(n->as_Vector()->length() == 16);
1986   match(Set dst (ReplicateB zero));
1987   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
1988   ins_encode %{
1989     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1990   %}
1991   ins_pipe( fpu_reg_reg );
1992 %}
1993 
1994 instruct Repl32B_zero(vecY dst, immI0 zero) %{
1995   predicate(n->as_Vector()->length() == 32);
1996   match(Set dst (ReplicateB zero));
1997   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
1998   ins_encode %{
    // The vpxor call expands to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
2000     bool vector256 = true;
2001     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2002   %}
2003   ins_pipe( fpu_reg_reg );
2004 %}
2005 
2006 // Replicate char/short (2 byte) scalar to be vector
2007 instruct Repl2S(vecS dst, rRegI src) %{
2008   predicate(n->as_Vector()->length() == 2);
2009   match(Set dst (ReplicateS src));
2010   format %{ "movd    $dst,$src\n\t"
2011             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
2012   ins_encode %{
2013     __ movdl($dst$$XMMRegister, $src$$Register);
2014     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2015   %}
2016   ins_pipe( fpu_reg_reg );
2017 %}
2018 
2019 instruct Repl4S(vecD dst, rRegI src) %{
2020   predicate(n->as_Vector()->length() == 4);
2021   match(Set dst (ReplicateS src));
2022   format %{ "movd    $dst,$src\n\t"
2023             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
2024   ins_encode %{
2025     __ movdl($dst$$XMMRegister, $src$$Register);
2026     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2027   %}
2028   ins_pipe( fpu_reg_reg );
2029 %}
2030 
2031 instruct Repl8S(vecX dst, rRegI src) %{
2032   predicate(n->as_Vector()->length() == 8);
2033   match(Set dst (ReplicateS src));
2034   format %{ "movd    $dst,$src\n\t"
2035             "pshuflw $dst,$dst,0x00\n\t"
2036             "punpcklqdq $dst,$dst\t! replicate8S" %}
2037   ins_encode %{
2038     __ movdl($dst$$XMMRegister, $src$$Register);
2039     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2040     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2041   %}
2042   ins_pipe( pipe_slow );
2043 %}
2044 
2045 instruct Repl16S(vecY dst, rRegI src) %{
2046   predicate(n->as_Vector()->length() == 16);
2047   match(Set dst (ReplicateS src));
2048   format %{ "movd    $dst,$src\n\t"
2049             "pshuflw $dst,$dst,0x00\n\t"
2050             "punpcklqdq $dst,$dst\n\t"
2051             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
2052   ins_encode %{
2053     __ movdl($dst$$XMMRegister, $src$$Register);
2054     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2055     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2056     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2057   %}
2058   ins_pipe( pipe_slow );
2059 %}
2060 
2061 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
2062 instruct Repl2S_imm(vecS dst, immI con) %{
2063   predicate(n->as_Vector()->length() == 2);
2064   match(Set dst (ReplicateS con));
2065   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
2066   ins_encode %{
2067     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
2068   %}
2069   ins_pipe( fpu_reg_reg );
2070 %}
2071 
2072 instruct Repl4S_imm(vecD dst, immI con) %{
2073   predicate(n->as_Vector()->length() == 4);
2074   match(Set dst (ReplicateS con));
2075   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
2076   ins_encode %{
2077     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2078   %}
2079   ins_pipe( fpu_reg_reg );
2080 %}
2081 
2082 instruct Repl8S_imm(vecX dst, immI con) %{
2083   predicate(n->as_Vector()->length() == 8);
2084   match(Set dst (ReplicateS con));
2085   format %{ "movq    $dst,[$constantaddress]\n\t"
2086             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
2087   ins_encode %{
2088     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2089     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2090   %}
2091   ins_pipe( pipe_slow );
2092 %}
2093 
2094 instruct Repl16S_imm(vecY dst, immI con) %{
2095   predicate(n->as_Vector()->length() == 16);
2096   match(Set dst (ReplicateS con));
2097   format %{ "movq    $dst,[$constantaddress]\n\t"
2098             "punpcklqdq $dst,$dst\n\t"
2099             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
2100   ins_encode %{
2101     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2102     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2103     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2104   %}
2105   ins_pipe( pipe_slow );
2106 %}
2107 
2108 // Replicate char/short (2 byte) scalar zero to be vector
2109 instruct Repl2S_zero(vecS dst, immI0 zero) %{
2110   predicate(n->as_Vector()->length() == 2);
2111   match(Set dst (ReplicateS zero));
2112   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
2113   ins_encode %{
2114     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2115   %}
2116   ins_pipe( fpu_reg_reg );
2117 %}
2118 
2119 instruct Repl4S_zero(vecD dst, immI0 zero) %{
2120   predicate(n->as_Vector()->length() == 4);
2121   match(Set dst (ReplicateS zero));
2122   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
2123   ins_encode %{
2124     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2125   %}
2126   ins_pipe( fpu_reg_reg );
2127 %}
2128 
2129 instruct Repl8S_zero(vecX dst, immI0 zero) %{
2130   predicate(n->as_Vector()->length() == 8);
2131   match(Set dst (ReplicateS zero));
2132   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
2133   ins_encode %{
2134     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2135   %}
2136   ins_pipe( fpu_reg_reg );
2137 %}
2138 
2139 instruct Repl16S_zero(vecY dst, immI0 zero) %{
2140   predicate(n->as_Vector()->length() == 16);
2141   match(Set dst (ReplicateS zero));
2142   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
2143   ins_encode %{
    // The vpxor call expands to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
2145     bool vector256 = true;
2146     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2147   %}
2148   ins_pipe( fpu_reg_reg );
2149 %}
2150 
2151 // Replicate integer (4 byte) scalar to be vector
2152 instruct Repl2I(vecD dst, rRegI src) %{
2153   predicate(n->as_Vector()->length() == 2);
2154   match(Set dst (ReplicateI src));
2155   format %{ "movd    $dst,$src\n\t"
2156             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2157   ins_encode %{
2158     __ movdl($dst$$XMMRegister, $src$$Register);
2159     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2160   %}
2161   ins_pipe( fpu_reg_reg );
2162 %}
2163 
2164 instruct Repl4I(vecX dst, rRegI src) %{
2165   predicate(n->as_Vector()->length() == 4);
2166   match(Set dst (ReplicateI src));
2167   format %{ "movd    $dst,$src\n\t"
2168             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2169   ins_encode %{
2170     __ movdl($dst$$XMMRegister, $src$$Register);
2171     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2172   %}
2173   ins_pipe( pipe_slow );
2174 %}
2175 
2176 instruct Repl8I(vecY dst, rRegI src) %{
2177   predicate(n->as_Vector()->length() == 8);
2178   match(Set dst (ReplicateI src));
2179   format %{ "movd    $dst,$src\n\t"
2180             "pshufd  $dst,$dst,0x00\n\t"
2181             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2182   ins_encode %{
2183     __ movdl($dst$$XMMRegister, $src$$Register);
2184     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2185     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2186   %}
2187   ins_pipe( pipe_slow );
2188 %}
2189 
2190 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
2191 instruct Repl2I_imm(vecD dst, immI con) %{
2192   predicate(n->as_Vector()->length() == 2);
2193   match(Set dst (ReplicateI con));
2194   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
2195   ins_encode %{
2196     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2197   %}
2198   ins_pipe( fpu_reg_reg );
2199 %}
2200 
2201 instruct Repl4I_imm(vecX dst, immI con) %{
2202   predicate(n->as_Vector()->length() == 4);
2203   match(Set dst (ReplicateI con));
2204   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
2205             "punpcklqdq $dst,$dst" %}
2206   ins_encode %{
2207     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2208     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2209   %}
2210   ins_pipe( pipe_slow );
2211 %}
2212 
2213 instruct Repl8I_imm(vecY dst, immI con) %{
2214   predicate(n->as_Vector()->length() == 8);
2215   match(Set dst (ReplicateI con));
2216   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
2217             "punpcklqdq $dst,$dst\n\t"
2218             "vinserti128h $dst,$dst,$dst" %}
2219   ins_encode %{
2220     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2221     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2222     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2223   %}
2224   ins_pipe( pipe_slow );
2225 %}
2226 
// The integer can be loaded into an xmm register directly from memory.
2228 instruct Repl2I_mem(vecD dst, memory mem) %{
2229   predicate(n->as_Vector()->length() == 2);
2230   match(Set dst (ReplicateI (LoadI mem)));
2231   format %{ "movd    $dst,$mem\n\t"
2232             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2233   ins_encode %{
2234     __ movdl($dst$$XMMRegister, $mem$$Address);
2235     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2236   %}
2237   ins_pipe( fpu_reg_reg );
2238 %}
2239 
2240 instruct Repl4I_mem(vecX dst, memory mem) %{
2241   predicate(n->as_Vector()->length() == 4);
2242   match(Set dst (ReplicateI (LoadI mem)));
2243   format %{ "movd    $dst,$mem\n\t"
2244             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2245   ins_encode %{
2246     __ movdl($dst$$XMMRegister, $mem$$Address);
2247     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2248   %}
2249   ins_pipe( pipe_slow );
2250 %}
2251 
2252 instruct Repl8I_mem(vecY dst, memory mem) %{
2253   predicate(n->as_Vector()->length() == 8);
2254   match(Set dst (ReplicateI (LoadI mem)));
2255   format %{ "movd    $dst,$mem\n\t"
2256             "pshufd  $dst,$dst,0x00\n\t"
2257             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2258   ins_encode %{
2259     __ movdl($dst$$XMMRegister, $mem$$Address);
2260     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2261     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2262   %}
2263   ins_pipe( pipe_slow );
2264 %}
2265 
2266 // Replicate integer (4 byte) scalar zero to be vector
2267 instruct Repl2I_zero(vecD dst, immI0 zero) %{
2268   predicate(n->as_Vector()->length() == 2);
2269   match(Set dst (ReplicateI zero));
2270   format %{ "pxor    $dst,$dst\t! replicate2I" %}
2271   ins_encode %{
2272     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2273   %}
2274   ins_pipe( fpu_reg_reg );
2275 %}
2276 
2277 instruct Repl4I_zero(vecX dst, immI0 zero) %{
2278   predicate(n->as_Vector()->length() == 4);
2279   match(Set dst (ReplicateI zero));
2280   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
2281   ins_encode %{
2282     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2283   %}
2284   ins_pipe( fpu_reg_reg );
2285 %}
2286 
2287 instruct Repl8I_zero(vecY dst, immI0 zero) %{
2288   predicate(n->as_Vector()->length() == 8);
2289   match(Set dst (ReplicateI zero));
2290   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
2291   ins_encode %{
    // The vpxor call expands to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
2293     bool vector256 = true;
2294     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2295   %}
2296   ins_pipe( fpu_reg_reg );
2297 %}
2298 
2299 // Replicate long (8 byte) scalar to be vector
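// On 64-bit (LP64) the long fits in one GPR, so movdq moves it into the XMM
// register directly; on 32-bit the lo and hi halves are moved with two movdl
// instructions and combined with punpckldq before being broadcast.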
2300 #ifdef _LP64
2301 instruct Repl2L(vecX dst, rRegL src) %{
2302   predicate(n->as_Vector()->length() == 2);
2303   match(Set dst (ReplicateL src));
2304   format %{ "movdq   $dst,$src\n\t"
2305             "punpcklqdq $dst,$dst\t! replicate2L" %}
2306   ins_encode %{
2307     __ movdq($dst$$XMMRegister, $src$$Register);
2308     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2309   %}
2310   ins_pipe( pipe_slow );
2311 %}
2312 
2313 instruct Repl4L(vecY dst, rRegL src) %{
2314   predicate(n->as_Vector()->length() == 4);
2315   match(Set dst (ReplicateL src));
2316   format %{ "movdq   $dst,$src\n\t"
2317             "punpcklqdq $dst,$dst\n\t"
2318             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2319   ins_encode %{
2320     __ movdq($dst$$XMMRegister, $src$$Register);
2321     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2322     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2323   %}
2324   ins_pipe( pipe_slow );
2325 %}
2326 #else // _LP64
2327 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
2328   predicate(n->as_Vector()->length() == 2);
2329   match(Set dst (ReplicateL src));
2330   effect(TEMP dst, USE src, TEMP tmp);
2331   format %{ "movdl   $dst,$src.lo\n\t"
2332             "movdl   $tmp,$src.hi\n\t"
2333             "punpckldq $dst,$tmp\n\t"
2334             "punpcklqdq $dst,$dst\t! replicate2L"%}
2335   ins_encode %{
2336     __ movdl($dst$$XMMRegister, $src$$Register);
2337     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2338     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2339     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2340   %}
2341   ins_pipe( pipe_slow );
2342 %}
2343 
2344 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
2345   predicate(n->as_Vector()->length() == 4);
2346   match(Set dst (ReplicateL src));
2347   effect(TEMP dst, USE src, TEMP tmp);
2348   format %{ "movdl   $dst,$src.lo\n\t"
2349             "movdl   $tmp,$src.hi\n\t"
2350             "punpckldq $dst,$tmp\n\t"
2351             "punpcklqdq $dst,$dst\n\t"
2352             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2353   ins_encode %{
2354     __ movdl($dst$$XMMRegister, $src$$Register);
2355     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2356     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2357     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2358     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2359   %}
2360   ins_pipe( pipe_slow );
2361 %}
2362 #endif // _LP64
2363 
2364 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
2365 instruct Repl2L_imm(vecX dst, immL con) %{
2366   predicate(n->as_Vector()->length() == 2);
2367   match(Set dst (ReplicateL con));
2368   format %{ "movq    $dst,[$constantaddress]\n\t"
2369             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
2370   ins_encode %{
2371     __ movq($dst$$XMMRegister, $constantaddress($con));
2372     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2373   %}
2374   ins_pipe( pipe_slow );
2375 %}
2376 
2377 instruct Repl4L_imm(vecY dst, immL con) %{
2378   predicate(n->as_Vector()->length() == 4);
2379   match(Set dst (ReplicateL con));
2380   format %{ "movq    $dst,[$constantaddress]\n\t"
2381             "punpcklqdq $dst,$dst\n\t"
2382             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
2383   ins_encode %{
2384     __ movq($dst$$XMMRegister, $constantaddress($con));
2385     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2386     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2387   %}
2388   ins_pipe( pipe_slow );
2389 %}
2390 
// The long can be loaded into an xmm register directly from memory.
2392 instruct Repl2L_mem(vecX dst, memory mem) %{
2393   predicate(n->as_Vector()->length() == 2);
2394   match(Set dst (ReplicateL (LoadL mem)));
2395   format %{ "movq    $dst,$mem\n\t"
2396             "punpcklqdq $dst,$dst\t! replicate2L" %}
2397   ins_encode %{
2398     __ movq($dst$$XMMRegister, $mem$$Address);
2399     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2400   %}
2401   ins_pipe( pipe_slow );
2402 %}
2403 
2404 instruct Repl4L_mem(vecY dst, memory mem) %{
2405   predicate(n->as_Vector()->length() == 4);
2406   match(Set dst (ReplicateL (LoadL mem)));
2407   format %{ "movq    $dst,$mem\n\t"
2408             "punpcklqdq $dst,$dst\n\t"
2409             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2410   ins_encode %{
2411     __ movq($dst$$XMMRegister, $mem$$Address);
2412     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2413     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2414   %}
2415   ins_pipe( pipe_slow );
2416 %}
2417 
2418 // Replicate long (8 byte) scalar zero to be vector
2419 instruct Repl2L_zero(vecX dst, immL0 zero) %{
2420   predicate(n->as_Vector()->length() == 2);
2421   match(Set dst (ReplicateL zero));
2422   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
2423   ins_encode %{
2424     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2425   %}
2426   ins_pipe( fpu_reg_reg );
2427 %}
2428 
2429 instruct Repl4L_zero(vecY dst, immL0 zero) %{
2430   predicate(n->as_Vector()->length() == 4);
2431   match(Set dst (ReplicateL zero));
2432   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
2433   ins_encode %{
    // The vpxor call expands to vxorpd on plain AVX, which lacks a 256-bit vpxor (AVX2 adds it).
2435     bool vector256 = true;
2436     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2437   %}
2438   ins_pipe( fpu_reg_reg );
2439 %}
2440 
2441 // Replicate float (4 byte) scalar to be vector
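// The float value is already in an XMM register, so a single pshufd with
// selector 0x00 broadcasts the low dword to every lane (plus vinsertf128h
// for the 256-bit case).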
2442 instruct Repl2F(vecD dst, regF src) %{
2443   predicate(n->as_Vector()->length() == 2);
2444   match(Set dst (ReplicateF src));
2445   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
2446   ins_encode %{
2447     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2448   %}
2449   ins_pipe( fpu_reg_reg );
2450 %}
2451 
2452 instruct Repl4F(vecX dst, regF src) %{
2453   predicate(n->as_Vector()->length() == 4);
2454   match(Set dst (ReplicateF src));
2455   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
2456   ins_encode %{
2457     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2458   %}
2459   ins_pipe( pipe_slow );
2460 %}
2461 
2462 instruct Repl8F(vecY dst, regF src) %{
2463   predicate(n->as_Vector()->length() == 8);
2464   match(Set dst (ReplicateF src));
2465   format %{ "pshufd  $dst,$src,0x00\n\t"
2466             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
2467   ins_encode %{
2468     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2469     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2470   %}
2471   ins_pipe( pipe_slow );
2472 %}
2473 
2474 // Replicate float (4 byte) scalar zero to be vector
2475 instruct Repl2F_zero(vecD dst, immF0 zero) %{
2476   predicate(n->as_Vector()->length() == 2);
2477   match(Set dst (ReplicateF zero));
2478   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
2479   ins_encode %{
2480     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2481   %}
2482   ins_pipe( fpu_reg_reg );
2483 %}
2484 
2485 instruct Repl4F_zero(vecX dst, immF0 zero) %{
2486   predicate(n->as_Vector()->length() == 4);
2487   match(Set dst (ReplicateF zero));
2488   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
2489   ins_encode %{
2490     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2491   %}
2492   ins_pipe( fpu_reg_reg );
2493 %}
2494 
2495 instruct Repl8F_zero(vecY dst, immF0 zero) %{
2496   predicate(n->as_Vector()->length() == 8);
2497   match(Set dst (ReplicateF zero));
2498   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
2499   ins_encode %{
2500     bool vector256 = true;
2501     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2502   %}
2503   ins_pipe( fpu_reg_reg );
2504 %}
2505 
2506 // Replicate double (8 bytes) scalar to be vector
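// pshufd with selector 0x44 copies the low quadword into both quadwords of
// the XMM register, broadcasting the double.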
2507 instruct Repl2D(vecX dst, regD src) %{
2508   predicate(n->as_Vector()->length() == 2);
2509   match(Set dst (ReplicateD src));
2510   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
2511   ins_encode %{
2512     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2513   %}
2514   ins_pipe( pipe_slow );
2515 %}
2516 
2517 instruct Repl4D(vecY dst, regD src) %{
2518   predicate(n->as_Vector()->length() == 4);
2519   match(Set dst (ReplicateD src));
2520   format %{ "pshufd  $dst,$src,0x44\n\t"
2521             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
2522   ins_encode %{
2523     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2524     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2525   %}
2526   ins_pipe( pipe_slow );
2527 %}
2528 
2529 // Replicate double (8 byte) scalar zero to be vector
2530 instruct Repl2D_zero(vecX dst, immD0 zero) %{
2531   predicate(n->as_Vector()->length() == 2);
2532   match(Set dst (ReplicateD zero));
2533   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
2534   ins_encode %{
2535     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
2536   %}
2537   ins_pipe( fpu_reg_reg );
2538 %}
2539 
2540 instruct Repl4D_zero(vecY dst, immD0 zero) %{
2541   predicate(n->as_Vector()->length() == 4);
2542   match(Set dst (ReplicateD zero));
2543   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
2544   ins_encode %{
2545     bool vector256 = true;
2546     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2547   %}
2548   ins_pipe( fpu_reg_reg );
2549 %}
2550 
2551 // ====================REDUCTION ARITHMETIC=======================================
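//
// Reduction instructs fold all lanes of the vector operand src2 into a single
// scalar and combine the result with the scalar operand src1 (e.g.
// AddReductionVI computes src1 plus the sum of all int lanes of src2).
// Horizontal adds (phaddd/vphaddd) or shuffle-and-add sequences collapse the
// lanes; for 256-bit vectors the upper 128-bit half is first extracted with
// vextractf128h.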
2552 
2553 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2554   predicate(UseSSE > 2 && UseAVX == 0);
2555   match(Set dst (AddReductionVI src1 src2));
2556   effect(TEMP tmp2, TEMP tmp);
2557   format %{ "movdqu  $tmp2,$src2\n\t"
2558             "phaddd  $tmp2,$tmp2\n\t"
2559             "movd  $tmp,$src1\n\t"
2560             "paddd  $tmp,$tmp2\n\t"
2561             "movd  $dst,$tmp\t! add reduction2I" %}
2562   ins_encode %{
2563     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
2564     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
2565     __ movdl($tmp$$XMMRegister, $src1$$Register);
2566     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
2567     __ movdl($dst$$Register, $tmp$$XMMRegister);
2568   %}
2569   ins_pipe( pipe_slow );
2570 %}
2571 
2572 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2573   predicate(UseAVX > 0);
2574   match(Set dst (AddReductionVI src1 src2));
2575   effect(TEMP tmp, TEMP tmp2);
2576   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
2577             "movd  $tmp2,$src1\n\t"
2578             "vpaddd  $tmp2,$tmp2,$tmp\n\t"
2579             "movd  $dst,$tmp2\t! add reduction2I" %}
2580   ins_encode %{
2581     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
2582     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2583     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
2584     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2585   %}
2586   ins_pipe( pipe_slow );
2587 %}
2588 
2589 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2590   predicate(UseSSE > 2 && UseAVX == 0);
2591   match(Set dst (AddReductionVI src1 src2));
2592   effect(TEMP tmp2, TEMP tmp);
2593   format %{ "movdqu  $tmp2,$src2\n\t"
2594             "phaddd  $tmp2,$tmp2\n\t"
2595             "phaddd  $tmp2,$tmp2\n\t"
2596             "movd  $tmp,$src1\n\t"
2597             "paddd  $tmp,$tmp2\n\t"
2598             "movd  $dst,$tmp\t! add reduction4I" %}
2599   ins_encode %{
2600     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
2601     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
2602     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
2603     __ movdl($tmp$$XMMRegister, $src1$$Register);
2604     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
2605     __ movdl($dst$$Register, $tmp$$XMMRegister);
2606   %}
2607   ins_pipe( pipe_slow );
2608 %}
2609 
2610 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2611   predicate(UseAVX > 0);
2612   match(Set dst (AddReductionVI src1 src2));
2613   effect(TEMP tmp, TEMP tmp2);
2614   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
2615             "vphaddd  $tmp,$tmp,$tmp2\n\t"
2616             "movd  $tmp2,$src1\n\t"
2617             "vpaddd  $tmp2,$tmp2,$tmp\n\t"
2618             "movd  $dst,$tmp2\t! add reduction4I" %}
2619   ins_encode %{
2620     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
2621     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2622     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2623     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
2624     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2625   %}
2626   ins_pipe( pipe_slow );
2627 %}
2628 
2629 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
2630   predicate(UseAVX > 0);
2631   match(Set dst (AddReductionVI src1 src2));
2632   effect(TEMP tmp, TEMP tmp2);
2633   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
2634             "vphaddd  $tmp,$tmp,$tmp2\n\t"
2635             "vextractf128  $tmp2,$tmp\n\t"
2636             "vpaddd  $tmp,$tmp,$tmp2\n\t"
2637             "movd  $tmp2,$src1\n\t"
2638             "vpaddd  $tmp2,$tmp2,$tmp\n\t"
2639             "movd  $dst,$tmp2\t! add reduction8I" %}
2640   ins_encode %{
2641     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true);
2642     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true);
2643     __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
2644     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2645     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2646     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
2647     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2648   %}
2649   ins_pipe( pipe_slow );
2650 %}
2651 
2652 instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2653   predicate(UseSSE >= 1 && UseAVX == 0);
2654   match(Set dst (AddReductionVF src1 src2));
2655   effect(TEMP tmp, TEMP tmp2);
2656   format %{ "movdqu  $tmp,$src1\n\t"
2657             "addss  $tmp,$src2\n\t"
2658             "pshufd  $tmp2,$src2,0x01\n\t"
2659             "addss  $tmp,$tmp2\n\t"
2660             "movdqu  $dst,$tmp\t! add reduction2F" %}
2661   ins_encode %{
2662     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2663     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
2664     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2665     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2666     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2667   %}
2668   ins_pipe( pipe_slow );
2669 %}
2670 
2671 instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2672   predicate(UseAVX > 0);
2673   match(Set dst (AddReductionVF src1 src2));
2674   effect(TEMP tmp2, TEMP tmp);
2675   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
2676             "pshufd  $tmp,$src2,0x01\n\t"
2677             "vaddss  $dst,$tmp2,$tmp\t! add reduction2F" %}
2678   ins_encode %{
2679     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2680     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2681     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2682   %}
2683   ins_pipe( pipe_slow );
2684 %}
2685 
2686 instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2687   predicate(UseSSE >= 1 && UseAVX == 0);
2688   match(Set dst (AddReductionVF src1 src2));
2689   effect(TEMP tmp, TEMP tmp2);
2690   format %{ "movdqu  $tmp,$src1\n\t"
2691                         "addss  $tmp,$src2\n\t"
2692             "pshufd  $tmp2,$src2,0x01\n\t"
2693             "addss  $tmp,$tmp2\n\t"
2694             "pshufd  $tmp2,$src2,0x02\n\t"
2695             "addss  $tmp,$tmp2\n\t"
2696             "pshufd  $tmp2,$src2,0x03\n\t"
2697             "addss  $tmp,$tmp2\n\t"
2698                         "movdqu  $dst,$tmp\t! add reduction4F" %}
2699   ins_encode %{
2700     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2701     __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
2702     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2703     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2704     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
2705     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister); 
2706     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
2707     __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2708     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2709   %}
2710   ins_pipe( pipe_slow );
2711 %}
2712 
2713 instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2714   predicate(UseAVX > 0);
2715   match(Set dst (AddReductionVF src1 src2));
2716   effect(TEMP tmp, TEMP tmp2);
2717   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
2718             "pshufd  $tmp,$src2,0x01\n\t"
2719             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2720             "pshufd  $tmp,$src2,0x02\n\t"
2721             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2722             "pshufd  $tmp,$src2,0x03\n\t"
2723             "vaddss  $dst,$tmp2,$tmp\t! add reduction4F" %}
2724   ins_encode %{
2725     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2726     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2727     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2728     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
2729     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); 
2730     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
2731     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2732   %}
2733   ins_pipe( pipe_slow );
2734 %}
2735 
2736 instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
2737   predicate(UseAVX > 0);
2738   match(Set dst (AddReductionVF src1 src2));
2739   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
2740   format %{ "vaddss  $tmp2,$src1,$src2\n\t"
2741             "pshufd  $tmp,$src2,0x01\n\t"
2742             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2743             "pshufd  $tmp,$src2,0x02\n\t"
2744             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2745             "pshufd  $tmp,$src2,0x03\n\t"
2746             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2747             "vextractf128  $tmp3,$src2\n\t"
2748             "vaddss  $tmp2,$tmp2,$tmp3\n\t"
2749             "pshufd  $tmp,$tmp3,0x01\n\t"
2750             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2751             "pshufd  $tmp,$tmp3,0x02\n\t"
2752             "vaddss  $tmp2,$tmp2,$tmp\n\t"
2753             "pshufd  $tmp,$tmp3,0x03\n\t"
2754             "vaddss  $dst,$tmp2,$tmp\t! add reduction8F" %}
2755   ins_encode %{
2756     __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2757     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2758     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2759     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
2760     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); 
2761     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
2762     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2763     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
2764     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
2765     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
2766     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2767     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
2768     __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); 
2769     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
2770     __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2771   %}
2772   ins_pipe( pipe_slow );
2773 %}
2774 
2775 instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
  predicate(UseSSE >= 2 && UseAVX == 0);
2777   match(Set dst (AddReductionVD src1 src2));
2778   effect(TEMP tmp, TEMP dst);
2779   format %{ "movdqu  $tmp,$src1\n\t"
2780             "addsd  $tmp,$src2\n\t"
2781             "pshufd  $dst,$src2,0xE\n\t"
2782             "addsd  $dst,$tmp\t! add reduction2D" %}
2783   ins_encode %{
2784     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2785     __ addsd($tmp$$XMMRegister, $src2$$XMMRegister);
2786     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
2787     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
2788   %}
2789   ins_pipe( pipe_slow );
2790 %}
2791 
2792 instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
2793   predicate(UseAVX > 0);
2794   match(Set dst (AddReductionVD src1 src2));
2795   effect(TEMP tmp, TEMP tmp2);
2796   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
2797             "pshufd  $tmp,$src2,0xE\n\t"
2798             "vaddsd  $dst,$tmp2,$tmp\t! add reduction2D" %}
2799   ins_encode %{
2800     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2801     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
2802     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2803   %}
2804   ins_pipe( pipe_slow );
2805 %}
2806 
2807 instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
2808   predicate(UseAVX > 0);
2809   match(Set dst (AddReductionVD src1 src2));
2810   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
2811   format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
2812             "pshufd  $tmp,$src2,0xE\n\t"
2813             "vaddsd  $tmp2,$tmp2,$tmp\n\t"
2814             "vextractf128  $tmp3,$src2\n\t"
2815             "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
2816             "pshufd  $tmp,$tmp3,0xE\n\t"
2817             "vaddsd  $dst,$tmp2,$tmp\t! add reduction4D" %}
2818   ins_encode %{
2819     __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2820     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
2821     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2822     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
2823     __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); 
2824     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
2825     __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2826   %}
2827   ins_pipe( pipe_slow );
2828 %}
2829 
2830 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2831   predicate(UseSSE > 3 && UseAVX == 0);
2832   match(Set dst (MulReductionVI src1 src2));
2833   effect(TEMP tmp, TEMP tmp2);
2834   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
2835             "pmulld  $tmp2,$src2\n\t"
2836             "movd  $tmp,$src1\n\t"
2837             "pmulld  $tmp2,$tmp\n\t"
2838             "movd  $dst,$tmp2\t! mul reduction2I" %}
2839   ins_encode %{
2840     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
2841     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
2842     __ movdl($tmp$$XMMRegister, $src1$$Register);
2843     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
2844     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2845   %}
2846   ins_pipe( pipe_slow );
2847 %}
2848 
2849 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
2850   predicate(UseAVX > 0);
2851   match(Set dst (MulReductionVI src1 src2));
2852   effect(TEMP tmp, TEMP tmp2);
2853   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
2854             "vpmulld  $tmp,$src2,$tmp2\n\t"
2855             "movd  $tmp2,$src1\n\t"
2856             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
2857             "movd  $dst,$tmp2\t! mul reduction2I" %}
2858   ins_encode %{
2859     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
2860     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
2861     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2862     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2863     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2864   %}
2865   ins_pipe( pipe_slow );
2866 %}
2867 
2868 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2869   predicate(UseSSE > 3 && UseAVX == 0);
2870   match(Set dst (MulReductionVI src1 src2));
2871   effect(TEMP tmp, TEMP tmp2);
2872   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
2873             "pmulld  $tmp2,$src2\n\t"
2874             "pshufd  $tmp,$tmp2,0x1\n\t"
2875             "pmulld  $tmp2,$tmp\n\t"
2876             "movd  $tmp,$src1\n\t"
2877             "pmulld  $tmp2,$tmp\n\t"
2878             "movd  $dst,$tmp2\t! mul reduction4I" %}
2879   ins_encode %{
2880     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
2881     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
2882     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
2883     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
2884     __ movdl($tmp$$XMMRegister, $src1$$Register);
2885     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
2886     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2887   %}
2888   ins_pipe( pipe_slow );
2889 %}
2890 
2891 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
2892   predicate(UseAVX > 0);
2893   match(Set dst (MulReductionVI src1 src2));
2894   effect(TEMP tmp, TEMP tmp2);
2895   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
2896             "vpmulld  $tmp,$src2,$tmp2\n\t"
2897             "pshufd  $tmp2,$tmp,0x1\n\t"
2898             "vpmulld  $tmp,$tmp,$tmp2\n\t"
2899             "movd  $tmp2,$src1\n\t"
2900             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
2901             "movd  $dst,$tmp2\t! mul reduction4I" %}
2902   ins_encode %{
2903     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
2904     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
2905     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
2906     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2907     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2908     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2909     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2910   %}
2911   ins_pipe( pipe_slow );
2912 %}
2913 
2914 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
2915   predicate(UseAVX > 0);
2916   match(Set dst (MulReductionVI src1 src2));
2917   effect(TEMP tmp, TEMP tmp2);
2918   format %{ "vextractf128  $tmp,$src2\n\t"
2919             "vpmulld  $tmp,$tmp,$src2\n\t"
2920             "pshufd  $tmp2,$tmp,0xE\n\t"
2921             "vpmulld  $tmp,$tmp,$tmp2\n\t"
2922             "pshufd  $tmp2,$tmp,0x1\n\t"
2923             "vpmulld  $tmp,$tmp,$tmp2\n\t"
2924             "movd  $tmp2,$src1\n\t"
2925             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
2926             "movd  $dst,$tmp2\t! mul reduction8I" %}
2927   ins_encode %{
2928     __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister);
2929     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false);
2930     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
2931     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2932     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
2933     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2934     __ movdl($tmp2$$XMMRegister, $src1$$Register);
2935     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
2936     __ movdl($dst$$Register, $tmp2$$XMMRegister);
2937   %}
2938   ins_pipe( pipe_slow );
2939 %}
2940 
2941 instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2942   predicate(UseSSE >= 1 && UseAVX == 0);
2943   match(Set dst (MulReductionVF src1 src2));
2944   effect(TEMP tmp, TEMP tmp2);
2945   format %{ "movdqu  $tmp,$src1\n\t"
2946             "mulss  $tmp,$src2\n\t"
2947             "pshufd  $tmp2,$src2,0x01\n\t"
2948             "mulss  $tmp,$tmp2\n\t"
2949             "movdqu  $dst,$tmp\t! mul reduction2F" %}
2950   ins_encode %{
2951     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2952     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
2953     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2954     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2955     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2956   %}
2957   ins_pipe( pipe_slow );
2958 %}
2959 
2960 instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
2961   predicate(UseAVX > 0);
2962   match(Set dst (MulReductionVF src1 src2));
2963   effect(TEMP tmp, TEMP tmp2);
2964   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
2965             "pshufd  $tmp,$src2,0x01\n\t"
2966             "vmulss  $dst,$tmp2,$tmp\t! mul reduction2F" %}
2967   ins_encode %{
2968     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2969     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
2970     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
2971   %}
2972   ins_pipe( pipe_slow );
2973 %}
2974 
2975 instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
2976   predicate(UseSSE >= 1 && UseAVX == 0);
2977   match(Set dst (MulReductionVF src1 src2));
2978   effect(TEMP tmp, TEMP tmp2);
2979   format %{ "movdqu  $tmp,$src1\n\t"
2980             "mulss  $tmp,$src2\n\t"
2981             "pshufd  $tmp2,$src2,0x01\n\t"
2982             "mulss  $tmp,$tmp2\n\t"
2983             "pshufd  $tmp2,$src2,0x02\n\t"
2984             "mulss  $tmp,$tmp2\n\t"
2985             "pshufd  $tmp2,$src2,0x03\n\t"
2986             "mulss  $tmp,$tmp2\n\t"
2987             "movdqu  $dst,$tmp\t! mul reduction4F" %}
2988   ins_encode %{
2989     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
2990     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
2991     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
2992     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2993     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
2994     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2995     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
2996     __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
2997     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
2998   %}
2999   ins_pipe( pipe_slow );
3000 %}
3001 
3002 instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
3003   predicate(UseAVX > 0);
3004   match(Set dst (MulReductionVF src1 src2));
3005   effect(TEMP tmp, TEMP tmp2);
3006   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
3007             "pshufd  $tmp,$src2,0x01\n\t"
3008             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3009             "pshufd  $tmp,$src2,0x02\n\t"
3010             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3011             "pshufd  $tmp,$src2,0x03\n\t"
3012             "vmulss  $dst,$tmp2,$tmp\t! mul reduction4F" %}
3013   ins_encode %{
3014     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3015     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
3016     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3017     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
3018     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3019     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
3020     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3021   %}
3022   ins_pipe( pipe_slow );
3023 %}
3024 
3025 instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
3026   predicate(UseAVX > 0);
3027   match(Set dst (MulReductionVF src1 src2));
3028   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
3029   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
3030             "pshufd  $tmp,$src2,0x01\n\t"
3031             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3032             "pshufd  $tmp,$src2,0x02\n\t"
3033             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3034             "pshufd  $tmp,$src2,0x03\n\t"
3035             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3036             "vextractf128  $tmp3,$src2\n\t"
3037             "vmulss  $tmp2,$tmp2,$tmp3\n\t"
3038             "pshufd  $tmp,$tmp3,0x01\n\t"
3039             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3040             "pshufd  $tmp,$tmp3,0x02\n\t"
3041             "vmulss  $tmp2,$tmp2,$tmp\n\t"
3042             "pshufd  $tmp,$tmp3,0x03\n\t"
3043             "vmulss  $dst,$tmp2,$tmp\t! mul reduction8F" %}
3044   ins_encode %{
3045     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3046     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
3047     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3048     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
3049     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3050     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
3051     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3052     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
3053     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
3054     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
3055     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3056     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
3057     __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3058     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
3059     __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3060   %}
3061   ins_pipe( pipe_slow );
3062 %}
3063 
3064 instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
3065   predicate(UseSSE >= 1 && UseAVX == 0);
3066   match(Set dst (MulReductionVD src1 src2));
3067   effect(TEMP tmp, TEMP dst);
3068   format %{ "movdqu  $tmp,$src1\n\t"
3069             "mulsd  $tmp,$src2\n\t"
3070             "pshufd  $dst,$src2,0xE\n\t"
3071             "mulsd  $dst,$tmp\t! mul reduction2D" %}
3072   ins_encode %{
3073     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
3074     __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
3075     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
3076     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
3077   %}
3078   ins_pipe( pipe_slow );
3079 %}
3080 
3081 instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
3082   predicate(UseAVX > 0);
3083   match(Set dst (MulReductionVD src1 src2));
3084   effect(TEMP tmp, TEMP tmp2);
3085   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
3086             "pshufd  $tmp,$src2,0xE\n\t"
3087             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction2D" %}
3088   ins_encode %{
3089     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3090     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
3091     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3092   %}
3093   ins_pipe( pipe_slow );
3094 %}
3095 
3096 instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
3097   predicate(UseAVX > 0);
3098   match(Set dst (MulReductionVD src1 src2));
3099   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
3100   format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
3101             "pshufd  $tmp,$src2,0xE\n\t"
3102             "vmulsd  $tmp2,$tmp2,$tmp\n\t"
3103             "vextractf128  $tmp3,$src2\n\t"
3104             "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
3105             "pshufd  $tmp,$tmp3,0xE\n\t"
3106             "vmulsd  $dst,$tmp2,$tmp\t! mul reduction4D" %}
3107   ins_encode %{
3108     __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3109     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
3110     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3111     __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
3112     __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
3113     __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
3114     __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
3115   %}
3116   ins_pipe( pipe_slow );
3117 %}
3118 
3119 // ====================VECTOR ARITHMETIC=======================================
3120 
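// Each element-wise operation below comes in up to three forms: a two-operand
// SSE form that updates dst in place, a three-operand AVX register form
// (vector256 selects the 128-bit or 256-bit encoding), and an AVX _mem form
// that folds the LoadVector of the second operand into a memory operand.
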
3121 // --------------------------------- ADD --------------------------------------
3122 
3123 // Bytes vector add
3124 instruct vadd4B(vecS dst, vecS src) %{
3125   predicate(n->as_Vector()->length() == 4);
3126   match(Set dst (AddVB dst src));
3127   format %{ "paddb   $dst,$src\t! add packed4B" %}
3128   ins_encode %{
3129     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3130   %}
3131   ins_pipe( pipe_slow );
3132 %}
3133 
3134 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
3135   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3136   match(Set dst (AddVB src1 src2));
3137   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
3138   ins_encode %{
3139     bool vector256 = false;
3140     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3141   %}
3142   ins_pipe( pipe_slow );
3143 %}
3144 
3145 instruct vadd8B(vecD dst, vecD src) %{
3146   predicate(n->as_Vector()->length() == 8);
3147   match(Set dst (AddVB dst src));
3148   format %{ "paddb   $dst,$src\t! add packed8B" %}
3149   ins_encode %{
3150     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3151   %}
3152   ins_pipe( pipe_slow );
3153 %}
3154 
3155 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
3156   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3157   match(Set dst (AddVB src1 src2));
3158   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
3159   ins_encode %{
3160     bool vector256 = false;
3161     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3162   %}
3163   ins_pipe( pipe_slow );
3164 %}
3165 
3166 instruct vadd16B(vecX dst, vecX src) %{
3167   predicate(n->as_Vector()->length() == 16);
3168   match(Set dst (AddVB dst src));
3169   format %{ "paddb   $dst,$src\t! add packed16B" %}
3170   ins_encode %{
3171     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
3172   %}
3173   ins_pipe( pipe_slow );
3174 %}
3175 
3176 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
3177   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3178   match(Set dst (AddVB src1 src2));
3179   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
3180   ins_encode %{
3181     bool vector256 = false;
3182     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3183   %}
3184   ins_pipe( pipe_slow );
3185 %}
3186 
3187 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
3188   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3189   match(Set dst (AddVB src (LoadVector mem)));
3190   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
3191   ins_encode %{
3192     bool vector256 = false;
3193     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3194   %}
3195   ins_pipe( pipe_slow );
3196 %}
3197 
3198 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
3199   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3200   match(Set dst (AddVB src1 src2));
3201   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
3202   ins_encode %{
3203     bool vector256 = true;
3204     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3205   %}
3206   ins_pipe( pipe_slow );
3207 %}
3208 
3209 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
3210   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3211   match(Set dst (AddVB src (LoadVector mem)));
3212   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
3213   ins_encode %{
3214     bool vector256 = true;
3215     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3216   %}
3217   ins_pipe( pipe_slow );
3218 %}
3219 
3220 // Shorts/Chars vector add
3221 instruct vadd2S(vecS dst, vecS src) %{
3222   predicate(n->as_Vector()->length() == 2);
3223   match(Set dst (AddVS dst src));
3224   format %{ "paddw   $dst,$src\t! add packed2S" %}
3225   ins_encode %{
3226     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3227   %}
3228   ins_pipe( pipe_slow );
3229 %}
3230 
3231 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
3232   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3233   match(Set dst (AddVS src1 src2));
3234   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
3235   ins_encode %{
3236     bool vector256 = false;
3237     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3238   %}
3239   ins_pipe( pipe_slow );
3240 %}
3241 
3242 instruct vadd4S(vecD dst, vecD src) %{
3243   predicate(n->as_Vector()->length() == 4);
3244   match(Set dst (AddVS dst src));
3245   format %{ "paddw   $dst,$src\t! add packed4S" %}
3246   ins_encode %{
3247     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3248   %}
3249   ins_pipe( pipe_slow );
3250 %}
3251 
3252 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
3253   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3254   match(Set dst (AddVS src1 src2));
3255   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
3256   ins_encode %{
3257     bool vector256 = false;
3258     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3259   %}
3260   ins_pipe( pipe_slow );
3261 %}
3262 
3263 instruct vadd8S(vecX dst, vecX src) %{
3264   predicate(n->as_Vector()->length() == 8);
3265   match(Set dst (AddVS dst src));
3266   format %{ "paddw   $dst,$src\t! add packed8S" %}
3267   ins_encode %{
3268     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
3269   %}
3270   ins_pipe( pipe_slow );
3271 %}
3272 
3273 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
3274   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3275   match(Set dst (AddVS src1 src2));
3276   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
3277   ins_encode %{
3278     bool vector256 = false;
3279     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3280   %}
3281   ins_pipe( pipe_slow );
3282 %}
3283 
3284 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
3285   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3286   match(Set dst (AddVS src (LoadVector mem)));
3287   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
3288   ins_encode %{
3289     bool vector256 = false;
3290     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3291   %}
3292   ins_pipe( pipe_slow );
3293 %}
3294 
3295 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
3296   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3297   match(Set dst (AddVS src1 src2));
3298   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
3299   ins_encode %{
3300     bool vector256 = true;
3301     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3302   %}
3303   ins_pipe( pipe_slow );
3304 %}
3305 
3306 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
3307   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3308   match(Set dst (AddVS src (LoadVector mem)));
3309   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
3310   ins_encode %{
3311     bool vector256 = true;
3312     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3313   %}
3314   ins_pipe( pipe_slow );
3315 %}
3316 
3317 // Integers vector add
3318 instruct vadd2I(vecD dst, vecD src) %{
3319   predicate(n->as_Vector()->length() == 2);
3320   match(Set dst (AddVI dst src));
3321   format %{ "paddd   $dst,$src\t! add packed2I" %}
3322   ins_encode %{
3323     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
3324   %}
3325   ins_pipe( pipe_slow );
3326 %}
3327 
3328 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
3329   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3330   match(Set dst (AddVI src1 src2));
3331   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
3332   ins_encode %{
3333     bool vector256 = false;
3334     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3335   %}
3336   ins_pipe( pipe_slow );
3337 %}
3338 
3339 instruct vadd4I(vecX dst, vecX src) %{
3340   predicate(n->as_Vector()->length() == 4);
3341   match(Set dst (AddVI dst src));
3342   format %{ "paddd   $dst,$src\t! add packed4I" %}
3343   ins_encode %{
3344     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
3345   %}
3346   ins_pipe( pipe_slow );
3347 %}
3348 
3349 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
3350   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3351   match(Set dst (AddVI src1 src2));
3352   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
3353   ins_encode %{
3354     bool vector256 = false;
3355     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3356   %}
3357   ins_pipe( pipe_slow );
3358 %}
3359 
3360 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
3361   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3362   match(Set dst (AddVI src (LoadVector mem)));
3363   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
3364   ins_encode %{
3365     bool vector256 = false;
3366     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3367   %}
3368   ins_pipe( pipe_slow );
3369 %}
3370 
3371 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
3372   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3373   match(Set dst (AddVI src1 src2));
3374   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
3375   ins_encode %{
3376     bool vector256 = true;
3377     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3378   %}
3379   ins_pipe( pipe_slow );
3380 %}
3381 
3382 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
3383   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3384   match(Set dst (AddVI src (LoadVector mem)));
3385   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
3386   ins_encode %{
3387     bool vector256 = true;
3388     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3389   %}
3390   ins_pipe( pipe_slow );
3391 %}
3392 
3393 // Longs vector add
3394 instruct vadd2L(vecX dst, vecX src) %{
3395   predicate(n->as_Vector()->length() == 2);
3396   match(Set dst (AddVL dst src));
3397   format %{ "paddq   $dst,$src\t! add packed2L" %}
3398   ins_encode %{
3399     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
3400   %}
3401   ins_pipe( pipe_slow );
3402 %}
3403 
3404 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
3405   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3406   match(Set dst (AddVL src1 src2));
3407   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
3408   ins_encode %{
3409     bool vector256 = false;
3410     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3411   %}
3412   ins_pipe( pipe_slow );
3413 %}
3414 
3415 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
3416   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3417   match(Set dst (AddVL src (LoadVector mem)));
3418   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
3419   ins_encode %{
3420     bool vector256 = false;
3421     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3422   %}
3423   ins_pipe( pipe_slow );
3424 %}
3425 
3426 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
3427   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3428   match(Set dst (AddVL src1 src2));
3429   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
3430   ins_encode %{
3431     bool vector256 = true;
3432     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3433   %}
3434   ins_pipe( pipe_slow );
3435 %}
3436 
3437 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
3438   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3439   match(Set dst (AddVL src (LoadVector mem)));
3440   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
3441   ins_encode %{
3442     bool vector256 = true;
3443     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3444   %}
3445   ins_pipe( pipe_slow );
3446 %}
3447 
3448 // Floats vector add
3449 instruct vadd2F(vecD dst, vecD src) %{
3450   predicate(n->as_Vector()->length() == 2);
3451   match(Set dst (AddVF dst src));
3452   format %{ "addps   $dst,$src\t! add packed2F" %}
3453   ins_encode %{
3454     __ addps($dst$$XMMRegister, $src$$XMMRegister);
3455   %}
3456   ins_pipe( pipe_slow );
3457 %}
3458 
3459 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
3460   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3461   match(Set dst (AddVF src1 src2));
3462   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
3463   ins_encode %{
3464     bool vector256 = false;
3465     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3466   %}
3467   ins_pipe( pipe_slow );
3468 %}
3469 
3470 instruct vadd4F(vecX dst, vecX src) %{
3471   predicate(n->as_Vector()->length() == 4);
3472   match(Set dst (AddVF dst src));
3473   format %{ "addps   $dst,$src\t! add packed4F" %}
3474   ins_encode %{
3475     __ addps($dst$$XMMRegister, $src$$XMMRegister);
3476   %}
3477   ins_pipe( pipe_slow );
3478 %}
3479 
3480 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
3481   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3482   match(Set dst (AddVF src1 src2));
3483   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
3484   ins_encode %{
3485     bool vector256 = false;
3486     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3487   %}
3488   ins_pipe( pipe_slow );
3489 %}
3490 
3491 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
3492   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3493   match(Set dst (AddVF src (LoadVector mem)));
3494   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
3495   ins_encode %{
3496     bool vector256 = false;
3497     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3498   %}
3499   ins_pipe( pipe_slow );
3500 %}
3501 
3502 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
3503   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3504   match(Set dst (AddVF src1 src2));
3505   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
3506   ins_encode %{
3507     bool vector256 = true;
3508     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3509   %}
3510   ins_pipe( pipe_slow );
3511 %}
3512 
3513 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
3514   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3515   match(Set dst (AddVF src (LoadVector mem)));
3516   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
3517   ins_encode %{
3518     bool vector256 = true;
3519     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3520   %}
3521   ins_pipe( pipe_slow );
3522 %}
3523 
3524 // Doubles vector add
3525 instruct vadd2D(vecX dst, vecX src) %{
3526   predicate(n->as_Vector()->length() == 2);
3527   match(Set dst (AddVD dst src));
3528   format %{ "addpd   $dst,$src\t! add packed2D" %}
3529   ins_encode %{
3530     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
3531   %}
3532   ins_pipe( pipe_slow );
3533 %}
3534 
3535 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
3536   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3537   match(Set dst (AddVD src1 src2));
3538   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
3539   ins_encode %{
3540     bool vector256 = false;
3541     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3542   %}
3543   ins_pipe( pipe_slow );
3544 %}
3545 
3546 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
3547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3548   match(Set dst (AddVD src (LoadVector mem)));
3549   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
3550   ins_encode %{
3551     bool vector256 = false;
3552     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3553   %}
3554   ins_pipe( pipe_slow );
3555 %}
3556 
3557 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
3558   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3559   match(Set dst (AddVD src1 src2));
3560   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
3561   ins_encode %{
3562     bool vector256 = true;
3563     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3564   %}
3565   ins_pipe( pipe_slow );
3566 %}
3567 
3568 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
3569   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3570   match(Set dst (AddVD src (LoadVector mem)));
3571   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
3572   ins_encode %{
3573     bool vector256 = true;
3574     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3575   %}
3576   ins_pipe( pipe_slow );
3577 %}
3578 
3579 // --------------------------------- SUB --------------------------------------
3580 
3581 // Bytes vector sub
3582 instruct vsub4B(vecS dst, vecS src) %{
3583   predicate(n->as_Vector()->length() == 4);
3584   match(Set dst (SubVB dst src));
3585   format %{ "psubb   $dst,$src\t! sub packed4B" %}
3586   ins_encode %{
3587     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3588   %}
3589   ins_pipe( pipe_slow );
3590 %}
3591 
3592 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
3593   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3594   match(Set dst (SubVB src1 src2));
3595   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
3596   ins_encode %{
3597     bool vector256 = false;
3598     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3599   %}
3600   ins_pipe( pipe_slow );
3601 %}
3602 
3603 instruct vsub8B(vecD dst, vecD src) %{
3604   predicate(n->as_Vector()->length() == 8);
3605   match(Set dst (SubVB dst src));
3606   format %{ "psubb   $dst,$src\t! sub packed8B" %}
3607   ins_encode %{
3608     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3609   %}
3610   ins_pipe( pipe_slow );
3611 %}
3612 
3613 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
3614   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3615   match(Set dst (SubVB src1 src2));
3616   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
3617   ins_encode %{
3618     bool vector256 = false;
3619     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3620   %}
3621   ins_pipe( pipe_slow );
3622 %}
3623 
3624 instruct vsub16B(vecX dst, vecX src) %{
3625   predicate(n->as_Vector()->length() == 16);
3626   match(Set dst (SubVB dst src));
3627   format %{ "psubb   $dst,$src\t! sub packed16B" %}
3628   ins_encode %{
3629     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3630   %}
3631   ins_pipe( pipe_slow );
3632 %}
3633 
3634 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
3635   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3636   match(Set dst (SubVB src1 src2));
3637   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
3638   ins_encode %{
3639     bool vector256 = false;
3640     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3641   %}
3642   ins_pipe( pipe_slow );
3643 %}
3644 
3645 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
3646   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3647   match(Set dst (SubVB src (LoadVector mem)));
3648   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
3649   ins_encode %{
3650     bool vector256 = false;
3651     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3652   %}
3653   ins_pipe( pipe_slow );
3654 %}
3655 
3656 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
3657   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3658   match(Set dst (SubVB src1 src2));
3659   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
3660   ins_encode %{
3661     bool vector256 = true;
3662     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3663   %}
3664   ins_pipe( pipe_slow );
3665 %}
3666 
3667 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
3668   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3669   match(Set dst (SubVB src (LoadVector mem)));
3670   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
3671   ins_encode %{
3672     bool vector256 = true;
3673     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3674   %}
3675   ins_pipe( pipe_slow );
3676 %}
3677 
3678 // Shorts/Chars vector sub
3679 instruct vsub2S(vecS dst, vecS src) %{
3680   predicate(n->as_Vector()->length() == 2);
3681   match(Set dst (SubVS dst src));
3682   format %{ "psubw   $dst,$src\t! sub packed2S" %}
3683   ins_encode %{
3684     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3685   %}
3686   ins_pipe( pipe_slow );
3687 %}
3688 
3689 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
3690   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3691   match(Set dst (SubVS src1 src2));
3692   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
3693   ins_encode %{
3694     bool vector256 = false;
3695     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3696   %}
3697   ins_pipe( pipe_slow );
3698 %}
3699 
3700 instruct vsub4S(vecD dst, vecD src) %{
3701   predicate(n->as_Vector()->length() == 4);
3702   match(Set dst (SubVS dst src));
3703   format %{ "psubw   $dst,$src\t! sub packed4S" %}
3704   ins_encode %{
3705     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3706   %}
3707   ins_pipe( pipe_slow );
3708 %}
3709 
3710 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
3711   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3712   match(Set dst (SubVS src1 src2));
3713   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
3714   ins_encode %{
3715     bool vector256 = false;
3716     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3717   %}
3718   ins_pipe( pipe_slow );
3719 %}
3720 
3721 instruct vsub8S(vecX dst, vecX src) %{
3722   predicate(n->as_Vector()->length() == 8);
3723   match(Set dst (SubVS dst src));
3724   format %{ "psubw   $dst,$src\t! sub packed8S" %}
3725   ins_encode %{
3726     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3727   %}
3728   ins_pipe( pipe_slow );
3729 %}
3730 
3731 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
3732   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3733   match(Set dst (SubVS src1 src2));
3734   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
3735   ins_encode %{
3736     bool vector256 = false;
3737     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3738   %}
3739   ins_pipe( pipe_slow );
3740 %}
3741 
3742 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
3743   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3744   match(Set dst (SubVS src (LoadVector mem)));
3745   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
3746   ins_encode %{
3747     bool vector256 = false;
3748     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3749   %}
3750   ins_pipe( pipe_slow );
3751 %}
3752 
3753 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
3754   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3755   match(Set dst (SubVS src1 src2));
3756   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
3757   ins_encode %{
3758     bool vector256 = true;
3759     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3760   %}
3761   ins_pipe( pipe_slow );
3762 %}
3763 
3764 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
3765   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3766   match(Set dst (SubVS src (LoadVector mem)));
3767   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
3768   ins_encode %{
3769     bool vector256 = true;
3770     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3771   %}
3772   ins_pipe( pipe_slow );
3773 %}
3774 
3775 // Integers vector sub
3776 instruct vsub2I(vecD dst, vecD src) %{
3777   predicate(n->as_Vector()->length() == 2);
3778   match(Set dst (SubVI dst src));
3779   format %{ "psubd   $dst,$src\t! sub packed2I" %}
3780   ins_encode %{
3781     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3782   %}
3783   ins_pipe( pipe_slow );
3784 %}
3785 
3786 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
3787   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3788   match(Set dst (SubVI src1 src2));
3789   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
3790   ins_encode %{
3791     bool vector256 = false;
3792     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3793   %}
3794   ins_pipe( pipe_slow );
3795 %}
3796 
3797 instruct vsub4I(vecX dst, vecX src) %{
3798   predicate(n->as_Vector()->length() == 4);
3799   match(Set dst (SubVI dst src));
3800   format %{ "psubd   $dst,$src\t! sub packed4I" %}
3801   ins_encode %{
3802     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3803   %}
3804   ins_pipe( pipe_slow );
3805 %}
3806 
3807 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
3808   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3809   match(Set dst (SubVI src1 src2));
3810   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
3811   ins_encode %{
3812     bool vector256 = false;
3813     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3814   %}
3815   ins_pipe( pipe_slow );
3816 %}
3817 
3818 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
3819   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3820   match(Set dst (SubVI src (LoadVector mem)));
3821   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
3822   ins_encode %{
3823     bool vector256 = false;
3824     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3825   %}
3826   ins_pipe( pipe_slow );
3827 %}
3828 
3829 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
3830   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3831   match(Set dst (SubVI src1 src2));
3832   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
3833   ins_encode %{
3834     bool vector256 = true;
3835     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3836   %}
3837   ins_pipe( pipe_slow );
3838 %}
3839 
3840 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
3841   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3842   match(Set dst (SubVI src (LoadVector mem)));
3843   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
3844   ins_encode %{
3845     bool vector256 = true;
3846     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3847   %}
3848   ins_pipe( pipe_slow );
3849 %}
3850 
3851 // Longs vector sub
3852 instruct vsub2L(vecX dst, vecX src) %{
3853   predicate(n->as_Vector()->length() == 2);
3854   match(Set dst (SubVL dst src));
3855   format %{ "psubq   $dst,$src\t! sub packed2L" %}
3856   ins_encode %{
3857     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
3858   %}
3859   ins_pipe( pipe_slow );
3860 %}
3861 
3862 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
3863   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3864   match(Set dst (SubVL src1 src2));
3865   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
3866   ins_encode %{
3867     bool vector256 = false;
3868     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3869   %}
3870   ins_pipe( pipe_slow );
3871 %}
3872 
3873 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
3874   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3875   match(Set dst (SubVL src (LoadVector mem)));
3876   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
3877   ins_encode %{
3878     bool vector256 = false;
3879     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3880   %}
3881   ins_pipe( pipe_slow );
3882 %}
3883 
3884 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
3885   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3886   match(Set dst (SubVL src1 src2));
3887   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
3888   ins_encode %{
3889     bool vector256 = true;
3890     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3891   %}
3892   ins_pipe( pipe_slow );
3893 %}
3894 
3895 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
3896   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3897   match(Set dst (SubVL src (LoadVector mem)));
3898   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
3899   ins_encode %{
3900     bool vector256 = true;
3901     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3902   %}
3903   ins_pipe( pipe_slow );
3904 %}
3905 
3906 // Floats vector sub
3907 instruct vsub2F(vecD dst, vecD src) %{
3908   predicate(n->as_Vector()->length() == 2);
3909   match(Set dst (SubVF dst src));
3910   format %{ "subps   $dst,$src\t! sub packed2F" %}
3911   ins_encode %{
3912     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3913   %}
3914   ins_pipe( pipe_slow );
3915 %}
3916 
3917 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
3918   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3919   match(Set dst (SubVF src1 src2));
3920   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
3921   ins_encode %{
3922     bool vector256 = false;
3923     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3924   %}
3925   ins_pipe( pipe_slow );
3926 %}
3927 
3928 instruct vsub4F(vecX dst, vecX src) %{
3929   predicate(n->as_Vector()->length() == 4);
3930   match(Set dst (SubVF dst src));
3931   format %{ "subps   $dst,$src\t! sub packed4F" %}
3932   ins_encode %{
3933     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3934   %}
3935   ins_pipe( pipe_slow );
3936 %}
3937 
3938 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
3939   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3940   match(Set dst (SubVF src1 src2));
3941   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
3942   ins_encode %{
3943     bool vector256 = false;
3944     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3945   %}
3946   ins_pipe( pipe_slow );
3947 %}
3948 
3949 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
3950   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3951   match(Set dst (SubVF src (LoadVector mem)));
3952   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
3953   ins_encode %{
3954     bool vector256 = false;
3955     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3956   %}
3957   ins_pipe( pipe_slow );
3958 %}
3959 
3960 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
3961   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3962   match(Set dst (SubVF src1 src2));
3963   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
3964   ins_encode %{
3965     bool vector256 = true;
3966     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3967   %}
3968   ins_pipe( pipe_slow );
3969 %}
3970 
3971 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
3972   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3973   match(Set dst (SubVF src (LoadVector mem)));
3974   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
3975   ins_encode %{
3976     bool vector256 = true;
3977     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3978   %}
3979   ins_pipe( pipe_slow );
3980 %}
3981 
3982 // Doubles vector sub
3983 instruct vsub2D(vecX dst, vecX src) %{
3984   predicate(n->as_Vector()->length() == 2);
3985   match(Set dst (SubVD dst src));
3986   format %{ "subpd   $dst,$src\t! sub packed2D" %}
3987   ins_encode %{
3988     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
3989   %}
3990   ins_pipe( pipe_slow );
3991 %}
3992 
3993 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
3994   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3995   match(Set dst (SubVD src1 src2));
3996   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
3997   ins_encode %{
3998     bool vector256 = false;
3999     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4000   %}
4001   ins_pipe( pipe_slow );
4002 %}
4003 
4004 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
4005   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4006   match(Set dst (SubVD src (LoadVector mem)));
4007   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
4008   ins_encode %{
4009     bool vector256 = false;
4010     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4011   %}
4012   ins_pipe( pipe_slow );
4013 %}
4014 
4015 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
4016   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4017   match(Set dst (SubVD src1 src2));
4018   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
4019   ins_encode %{
4020     bool vector256 = true;
4021     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4022   %}
4023   ins_pipe( pipe_slow );
4024 %}
4025 
4026 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
4027   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4028   match(Set dst (SubVD src (LoadVector mem)));
4029   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
4030   ins_encode %{
4031     bool vector256 = true;
4032     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4033   %}
4034   ins_pipe( pipe_slow );
4035 %}
4036 
4037 // --------------------------------- MUL --------------------------------------
4038 
4039 // Shorts/Chars vector mul
4040 instruct vmul2S(vecS dst, vecS src) %{
4041   predicate(n->as_Vector()->length() == 2);
4042   match(Set dst (MulVS dst src));
4043   format %{ "pmullw $dst,$src\t! mul packed2S" %}
4044   ins_encode %{
4045     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4046   %}
4047   ins_pipe( pipe_slow );
4048 %}
4049 
4050 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
4051   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4052   match(Set dst (MulVS src1 src2));
4053   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
4054   ins_encode %{
4055     bool vector256 = false;
4056     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4057   %}
4058   ins_pipe( pipe_slow );
4059 %}
4060 
4061 instruct vmul4S(vecD dst, vecD src) %{
4062   predicate(n->as_Vector()->length() == 4);
4063   match(Set dst (MulVS dst src));
4064   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
4065   ins_encode %{
4066     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4067   %}
4068   ins_pipe( pipe_slow );
4069 %}
4070 
4071 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
4072   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4073   match(Set dst (MulVS src1 src2));
4074   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
4075   ins_encode %{
4076     bool vector256 = false;
4077     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4078   %}
4079   ins_pipe( pipe_slow );
4080 %}
4081 
4082 instruct vmul8S(vecX dst, vecX src) %{
4083   predicate(n->as_Vector()->length() == 8);
4084   match(Set dst (MulVS dst src));
4085   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
4086   ins_encode %{
4087     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
4088   %}
4089   ins_pipe( pipe_slow );
4090 %}
4091 
4092 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
4093   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4094   match(Set dst (MulVS src1 src2));
4095   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
4096   ins_encode %{
4097     bool vector256 = false;
4098     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4099   %}
4100   ins_pipe( pipe_slow );
4101 %}
4102 
4103 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
4104   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4105   match(Set dst (MulVS src (LoadVector mem)));
4106   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
4107   ins_encode %{
4108     bool vector256 = false;
4109     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4110   %}
4111   ins_pipe( pipe_slow );
4112 %}
4113 
4114 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
4115   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4116   match(Set dst (MulVS src1 src2));
4117   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
4118   ins_encode %{
4119     bool vector256 = true;
4120     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4121   %}
4122   ins_pipe( pipe_slow );
4123 %}
4124 
4125 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
4126   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4127   match(Set dst (MulVS src (LoadVector mem)));
4128   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
4129   ins_encode %{
4130     bool vector256 = true;
4131     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4132   %}
4133   ins_pipe( pipe_slow );
4134 %}
4135 
4136 // Integers vector mul (sse4_1)
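// pmulld (packed 32-bit low multiply) is an SSE4.1 instruction, hence the
// UseSSE > 3 predicate on the non-AVX forms below.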
4137 instruct vmul2I(vecD dst, vecD src) %{
4138   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
4139   match(Set dst (MulVI dst src));
4140   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
4141   ins_encode %{
4142     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
4143   %}
4144   ins_pipe( pipe_slow );
4145 %}
4146 
4147 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
4148   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4149   match(Set dst (MulVI src1 src2));
4150   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
4151   ins_encode %{
4152     bool vector256 = false;
4153     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4154   %}
4155   ins_pipe( pipe_slow );
4156 %}
4157 
4158 instruct vmul4I(vecX dst, vecX src) %{
4159   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
4160   match(Set dst (MulVI dst src));
4161   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
4162   ins_encode %{
4163     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
4164   %}
4165   ins_pipe( pipe_slow );
4166 %}
4167 
4168 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
4169   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4170   match(Set dst (MulVI src1 src2));
4171   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
4172   ins_encode %{
4173     bool vector256 = false;
4174     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4175   %}
4176   ins_pipe( pipe_slow );
4177 %}
4178 
4179 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
4180   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4181   match(Set dst (MulVI src (LoadVector mem)));
4182   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
4183   ins_encode %{
4184     bool vector256 = false;
4185     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4186   %}
4187   ins_pipe( pipe_slow );
4188 %}
4189 
4190 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
4191   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4192   match(Set dst (MulVI src1 src2));
4193   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
4194   ins_encode %{
4195     bool vector256 = true;
4196     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4197   %}
4198   ins_pipe( pipe_slow );
4199 %}
4200 
4201 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
4202   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4203   match(Set dst (MulVI src (LoadVector mem)));
4204   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
4205   ins_encode %{
4206     bool vector256 = true;
4207     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4208   %}
4209   ins_pipe( pipe_slow );
4210 %}
4211 
4212 // Floats vector mul
4213 instruct vmul2F(vecD dst, vecD src) %{
4214   predicate(n->as_Vector()->length() == 2);
4215   match(Set dst (MulVF dst src));
4216   format %{ "mulps   $dst,$src\t! mul packed2F" %}
4217   ins_encode %{
4218     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
4219   %}
4220   ins_pipe( pipe_slow );
4221 %}
4222 
4223 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
4224   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4225   match(Set dst (MulVF src1 src2));
4226   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
4227   ins_encode %{
4228     bool vector256 = false;
4229     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4230   %}
4231   ins_pipe( pipe_slow );
4232 %}
4233 
4234 instruct vmul4F(vecX dst, vecX src) %{
4235   predicate(n->as_Vector()->length() == 4);
4236   match(Set dst (MulVF dst src));
4237   format %{ "mulps   $dst,$src\t! mul packed4F" %}
4238   ins_encode %{
4239     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
4240   %}
4241   ins_pipe( pipe_slow );
4242 %}
4243 
4244 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
4245   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4246   match(Set dst (MulVF src1 src2));
4247   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
4248   ins_encode %{
4249     bool vector256 = false;
4250     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4251   %}
4252   ins_pipe( pipe_slow );
4253 %}
4254 
4255 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
4256   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4257   match(Set dst (MulVF src (LoadVector mem)));
4258   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
4259   ins_encode %{
4260     bool vector256 = false;
4261     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4262   %}
4263   ins_pipe( pipe_slow );
4264 %}
4265 
4266 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
4267   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4268   match(Set dst (MulVF src1 src2));
4269   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
4270   ins_encode %{
4271     bool vector256 = true;
4272     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4273   %}
4274   ins_pipe( pipe_slow );
4275 %}
4276 
4277 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
4278   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4279   match(Set dst (MulVF src (LoadVector mem)));
4280   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
4281   ins_encode %{
4282     bool vector256 = true;
4283     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4284   %}
4285   ins_pipe( pipe_slow );
4286 %}
4287 
4288 // Doubles vector mul
4289 instruct vmul2D(vecX dst, vecX src) %{
4290   predicate(n->as_Vector()->length() == 2);
4291   match(Set dst (MulVD dst src));
4292   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
4293   ins_encode %{
4294     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
4295   %}
4296   ins_pipe( pipe_slow );
4297 %}
4298 
4299 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
4300   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4301   match(Set dst (MulVD src1 src2));
4302   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
4303   ins_encode %{
4304     bool vector256 = false;
4305     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4306   %}
4307   ins_pipe( pipe_slow );
4308 %}
4309 
4310 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
4311   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4312   match(Set dst (MulVD src (LoadVector mem)));
4313   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
4314   ins_encode %{
4315     bool vector256 = false;
4316     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4317   %}
4318   ins_pipe( pipe_slow );
4319 %}
4320 
4321 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
4322   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4323   match(Set dst (MulVD src1 src2));
4324   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
4325   ins_encode %{
4326     bool vector256 = true;
4327     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4328   %}
4329   ins_pipe( pipe_slow );
4330 %}
4331 
4332 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
4333   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4334   match(Set dst (MulVD src (LoadVector mem)));
4335   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
4336   ins_encode %{
4337     bool vector256 = true;
4338     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4339   %}
4340   ins_pipe( pipe_slow );
4341 %}
4342 
4343 // --------------------------------- DIV --------------------------------------
4344 
4345 // Floats vector div
4346 instruct vdiv2F(vecD dst, vecD src) %{
4347   predicate(n->as_Vector()->length() == 2);
4348   match(Set dst (DivVF dst src));
4349   format %{ "divps   $dst,$src\t! div packed2F" %}
4350   ins_encode %{
4351     __ divps($dst$$XMMRegister, $src$$XMMRegister);
4352   %}
4353   ins_pipe( pipe_slow );
4354 %}
4355 
4356 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
4357   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4358   match(Set dst (DivVF src1 src2));
4359   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
4360   ins_encode %{
4361     bool vector256 = false;
4362     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4363   %}
4364   ins_pipe( pipe_slow );
4365 %}
4366 
4367 instruct vdiv4F(vecX dst, vecX src) %{
4368   predicate(n->as_Vector()->length() == 4);
4369   match(Set dst (DivVF dst src));
4370   format %{ "divps   $dst,$src\t! div packed4F" %}
4371   ins_encode %{
4372     __ divps($dst$$XMMRegister, $src$$XMMRegister);
4373   %}
4374   ins_pipe( pipe_slow );
4375 %}
4376 
4377 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
4378   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4379   match(Set dst (DivVF src1 src2));
4380   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
4381   ins_encode %{
4382     bool vector256 = false;
4383     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4384   %}
4385   ins_pipe( pipe_slow );
4386 %}
4387 
4388 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
4389   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4390   match(Set dst (DivVF src (LoadVector mem)));
4391   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
4392   ins_encode %{
4393     bool vector256 = false;
4394     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4395   %}
4396   ins_pipe( pipe_slow );
4397 %}
4398 
4399 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
4400   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4401   match(Set dst (DivVF src1 src2));
4402   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
4403   ins_encode %{
4404     bool vector256 = true;
4405     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4406   %}
4407   ins_pipe( pipe_slow );
4408 %}
4409 
4410 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
4411   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4412   match(Set dst (DivVF src (LoadVector mem)));
4413   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
4414   ins_encode %{
4415     bool vector256 = true;
4416     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4417   %}
4418   ins_pipe( pipe_slow );
4419 %}
4420 
4421 // Doubles vector div
4422 instruct vdiv2D(vecX dst, vecX src) %{
4423   predicate(n->as_Vector()->length() == 2);
4424   match(Set dst (DivVD dst src));
4425   format %{ "divpd   $dst,$src\t! div packed2D" %}
4426   ins_encode %{
4427     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
4428   %}
4429   ins_pipe( pipe_slow );
4430 %}
4431 
4432 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
4433   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4434   match(Set dst (DivVD src1 src2));
4435   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
4436   ins_encode %{
4437     bool vector256 = false;
4438     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4439   %}
4440   ins_pipe( pipe_slow );
4441 %}
4442 
4443 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
4444   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4445   match(Set dst (DivVD src (LoadVector mem)));
4446   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
4447   ins_encode %{
4448     bool vector256 = false;
4449     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4450   %}
4451   ins_pipe( pipe_slow );
4452 %}
4453 
4454 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
4455   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4456   match(Set dst (DivVD src1 src2));
4457   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
4458   ins_encode %{
4459     bool vector256 = true;
4460     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4461   %}
4462   ins_pipe( pipe_slow );
4463 %}
4464 
4465 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
4466   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4467   match(Set dst (DivVD src (LoadVector mem)));
4468   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
4469   ins_encode %{
4470     bool vector256 = true;
4471     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4472   %}
4473   ins_pipe( pipe_slow );
4474 %}
4475 
4476 // ------------------------------ Shift ---------------------------------------
4477 
4478 // Left and right shift count vectors are the same on x86
4479 // (only the low bits of the xmm register are used for the shift count).
4480 instruct vshiftcnt(vecS dst, rRegI cnt) %{
4481   match(Set dst (LShiftCntV cnt));
4482   match(Set dst (RShiftCntV cnt));
4483   format %{ "movd    $dst,$cnt\t! load shift count" %}
4484   ins_encode %{
4485     __ movdl($dst$$XMMRegister, $cnt$$Register);
4486   %}
4487   ins_pipe( pipe_slow );
4488 %}
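// Illustrative sketch only (not a matcher rule): a hedged example of the kind
// of Java loop expected to reach vshiftcnt. When the shift count 's' is a
// loop-invariant variable (names here are hypothetical), it is materialized
// once via the movd above and reused by the packed shifts that follow:
//
//   for (int i = 0; i < a.length; i++) {
//     a[i] = (short)(a[i] << s);   // variable count: LShiftCntV + vshiftcnt
//   }
//
// Constant counts instead match the *_imm rules below, which encode the count
// directly in the instruction.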
4489 
4490 // ------------------------------ LeftShift -----------------------------------
4491 
4492 // Shorts/Chars vector left shift
4493 instruct vsll2S(vecS dst, vecS shift) %{
4494   predicate(n->as_Vector()->length() == 2);
4495   match(Set dst (LShiftVS dst shift));
4496   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
4497   ins_encode %{
4498     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
4499   %}
4500   ins_pipe( pipe_slow );
4501 %}
4502 
4503 instruct vsll2S_imm(vecS dst, immI8 shift) %{
4504   predicate(n->as_Vector()->length() == 2);
4505   match(Set dst (LShiftVS dst shift));
4506   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
4507   ins_encode %{
4508     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4509   %}
4510   ins_pipe( pipe_slow );
4511 %}
4512 
4513 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
4514   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4515   match(Set dst (LShiftVS src shift));
4516   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
4517   ins_encode %{
4518     bool vector256 = false;
4519     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4520   %}
4521   ins_pipe( pipe_slow );
4522 %}
4523 
4524 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4525   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4526   match(Set dst (LShiftVS src shift));
4527   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
4528   ins_encode %{
4529     bool vector256 = false;
4530     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4531   %}
4532   ins_pipe( pipe_slow );
4533 %}
4534 
4535 instruct vsll4S(vecD dst, vecS shift) %{
4536   predicate(n->as_Vector()->length() == 4);
4537   match(Set dst (LShiftVS dst shift));
4538   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
4539   ins_encode %{
4540     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
4541   %}
4542   ins_pipe( pipe_slow );
4543 %}
4544 
4545 instruct vsll4S_imm(vecD dst, immI8 shift) %{
4546   predicate(n->as_Vector()->length() == 4);
4547   match(Set dst (LShiftVS dst shift));
4548   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
4549   ins_encode %{
4550     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4551   %}
4552   ins_pipe( pipe_slow );
4553 %}
4554 
4555 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
4556   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4557   match(Set dst (LShiftVS src shift));
4558   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
4559   ins_encode %{
4560     bool vector256 = false;
4561     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4562   %}
4563   ins_pipe( pipe_slow );
4564 %}
4565 
4566 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4567   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4568   match(Set dst (LShiftVS src shift));
4569   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
4570   ins_encode %{
4571     bool vector256 = false;
4572     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4573   %}
4574   ins_pipe( pipe_slow );
4575 %}
4576 
4577 instruct vsll8S(vecX dst, vecS shift) %{
4578   predicate(n->as_Vector()->length() == 8);
4579   match(Set dst (LShiftVS dst shift));
4580   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
4581   ins_encode %{
4582     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
4583   %}
4584   ins_pipe( pipe_slow );
4585 %}
4586 
4587 instruct vsll8S_imm(vecX dst, immI8 shift) %{
4588   predicate(n->as_Vector()->length() == 8);
4589   match(Set dst (LShiftVS dst shift));
4590   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
4591   ins_encode %{
4592     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
4593   %}
4594   ins_pipe( pipe_slow );
4595 %}
4596 
4597 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
4598   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4599   match(Set dst (LShiftVS src shift));
4600   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4601   ins_encode %{
4602     bool vector256 = false;
4603     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4604   %}
4605   ins_pipe( pipe_slow );
4606 %}
4607 
4608 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4609   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4610   match(Set dst (LShiftVS src shift));
4611   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4612   ins_encode %{
4613     bool vector256 = false;
4614     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4615   %}
4616   ins_pipe( pipe_slow );
4617 %}
4618 
4619 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
4620   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4621   match(Set dst (LShiftVS src shift));
4622   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4623   ins_encode %{
4624     bool vector256 = true;
4625     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4626   %}
4627   ins_pipe( pipe_slow );
4628 %}
4629 
4630 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4631   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4632   match(Set dst (LShiftVS src shift));
4633   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4634   ins_encode %{
4635     bool vector256 = true;
4636     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4637   %}
4638   ins_pipe( pipe_slow );
4639 %}
4640 
4641 // Integers vector left shift
4642 instruct vsll2I(vecD dst, vecS shift) %{
4643   predicate(n->as_Vector()->length() == 2);
4644   match(Set dst (LShiftVI dst shift));
4645   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4646   ins_encode %{
4647     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4648   %}
4649   ins_pipe( pipe_slow );
4650 %}
4651 
4652 instruct vsll2I_imm(vecD dst, immI8 shift) %{
4653   predicate(n->as_Vector()->length() == 2);
4654   match(Set dst (LShiftVI dst shift));
4655   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4656   ins_encode %{
4657     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4658   %}
4659   ins_pipe( pipe_slow );
4660 %}
4661 
4662 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
4663   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4664   match(Set dst (LShiftVI src shift));
4665   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4666   ins_encode %{
4667     bool vector256 = false;
4668     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4669   %}
4670   ins_pipe( pipe_slow );
4671 %}
4672 
4673 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4674   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4675   match(Set dst (LShiftVI src shift));
4676   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4677   ins_encode %{
4678     bool vector256 = false;
4679     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4680   %}
4681   ins_pipe( pipe_slow );
4682 %}
4683 
4684 instruct vsll4I(vecX dst, vecS shift) %{
4685   predicate(n->as_Vector()->length() == 4);
4686   match(Set dst (LShiftVI dst shift));
4687   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4688   ins_encode %{
4689     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4690   %}
4691   ins_pipe( pipe_slow );
4692 %}
4693 
4694 instruct vsll4I_imm(vecX dst, immI8 shift) %{
4695   predicate(n->as_Vector()->length() == 4);
4696   match(Set dst (LShiftVI dst shift));
4697   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4698   ins_encode %{
4699     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4700   %}
4701   ins_pipe( pipe_slow );
4702 %}
4703 
4704 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
4705   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4706   match(Set dst (LShiftVI src shift));
4707   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4708   ins_encode %{
4709     bool vector256 = false;
4710     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4711   %}
4712   ins_pipe( pipe_slow );
4713 %}
4714 
4715 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4716   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4717   match(Set dst (LShiftVI src shift));
4718   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4719   ins_encode %{
4720     bool vector256 = false;
4721     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4722   %}
4723   ins_pipe( pipe_slow );
4724 %}
4725 
4726 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
4727   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4728   match(Set dst (LShiftVI src shift));
4729   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4730   ins_encode %{
4731     bool vector256 = true;
4732     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4733   %}
4734   ins_pipe( pipe_slow );
4735 %}
4736 
4737 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4738   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4739   match(Set dst (LShiftVI src shift));
4740   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4741   ins_encode %{
4742     bool vector256 = true;
4743     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4744   %}
4745   ins_pipe( pipe_slow );
4746 %}
4747 
4748 // Longs vector left shift
4749 instruct vsll2L(vecX dst, vecS shift) %{
4750   predicate(n->as_Vector()->length() == 2);
4751   match(Set dst (LShiftVL dst shift));
4752   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4753   ins_encode %{
4754     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
4755   %}
4756   ins_pipe( pipe_slow );
4757 %}
4758 
4759 instruct vsll2L_imm(vecX dst, immI8 shift) %{
4760   predicate(n->as_Vector()->length() == 2);
4761   match(Set dst (LShiftVL dst shift));
4762   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4763   ins_encode %{
4764     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
4765   %}
4766   ins_pipe( pipe_slow );
4767 %}
4768 
4769 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
4770   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4771   match(Set dst (LShiftVL src shift));
4772   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4773   ins_encode %{
4774     bool vector256 = false;
4775     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4776   %}
4777   ins_pipe( pipe_slow );
4778 %}
4779 
4780 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4781   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4782   match(Set dst (LShiftVL src shift));
4783   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4784   ins_encode %{
4785     bool vector256 = false;
4786     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4787   %}
4788   ins_pipe( pipe_slow );
4789 %}
4790 
4791 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
4792   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4793   match(Set dst (LShiftVL src shift));
4794   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4795   ins_encode %{
4796     bool vector256 = true;
4797     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4798   %}
4799   ins_pipe( pipe_slow );
4800 %}
4801 
4802 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4803   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4804   match(Set dst (LShiftVL src shift));
4805   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4806   ins_encode %{
4807     bool vector256 = true;
4808     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4809   %}
4810   ins_pipe( pipe_slow );
4811 %}
4812 
4813 // ----------------------- LogicalRightShift -----------------------------------
4814 
4815 // Shorts vector logical right shift produces an incorrect Java result
4816 // for negative data because Java code converts short values into ints with
4817 // sign extension before a shift. But char vectors are fine since chars are
4818 // unsigned values.
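//
// Hedged numeric illustration (hypothetical values, not generated code):
// for short s = -2 (bits 0xFFFE), scalar Java widens first, so
//   ((int)s) >>> 1 == 0x7FFFFFFF, which narrows back to (short)0xFFFF == -1,
// while a 16-bit lane shift (psrlw by 1) would yield 0x7FFF == 32767.
// Chars widen with zero extension, so packed and scalar results agree and
// the char case can still be vectorized with these rules.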
4819 
4820 instruct vsrl2S(vecS dst, vecS shift) %{
4821   predicate(n->as_Vector()->length() == 2);
4822   match(Set dst (URShiftVS dst shift));
4823   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4824   ins_encode %{
4825     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4826   %}
4827   ins_pipe( pipe_slow );
4828 %}
4829 
4830 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
4831   predicate(n->as_Vector()->length() == 2);
4832   match(Set dst (URShiftVS dst shift));
4833   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4834   ins_encode %{
4835     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4836   %}
4837   ins_pipe( pipe_slow );
4838 %}
4839 
4840 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
4841   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4842   match(Set dst (URShiftVS src shift));
4843   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4844   ins_encode %{
4845     bool vector256 = false;
4846     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4847   %}
4848   ins_pipe( pipe_slow );
4849 %}
4850 
4851 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4852   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4853   match(Set dst (URShiftVS src shift));
4854   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4855   ins_encode %{
4856     bool vector256 = false;
4857     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4858   %}
4859   ins_pipe( pipe_slow );
4860 %}
4861 
4862 instruct vsrl4S(vecD dst, vecS shift) %{
4863   predicate(n->as_Vector()->length() == 4);
4864   match(Set dst (URShiftVS dst shift));
4865   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4866   ins_encode %{
4867     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4868   %}
4869   ins_pipe( pipe_slow );
4870 %}
4871 
4872 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
4873   predicate(n->as_Vector()->length() == 4);
4874   match(Set dst (URShiftVS dst shift));
4875   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4876   ins_encode %{
4877     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4878   %}
4879   ins_pipe( pipe_slow );
4880 %}
4881 
4882 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
4883   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4884   match(Set dst (URShiftVS src shift));
4885   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4886   ins_encode %{
4887     bool vector256 = false;
4888     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4889   %}
4890   ins_pipe( pipe_slow );
4891 %}
4892 
4893 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4894   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4895   match(Set dst (URShiftVS src shift));
4896   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4897   ins_encode %{
4898     bool vector256 = false;
4899     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4900   %}
4901   ins_pipe( pipe_slow );
4902 %}
4903 
4904 instruct vsrl8S(vecX dst, vecS shift) %{
4905   predicate(n->as_Vector()->length() == 8);
4906   match(Set dst (URShiftVS dst shift));
4907   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4908   ins_encode %{
4909     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4910   %}
4911   ins_pipe( pipe_slow );
4912 %}
4913 
4914 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
4915   predicate(n->as_Vector()->length() == 8);
4916   match(Set dst (URShiftVS dst shift));
4917   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4918   ins_encode %{
4919     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4920   %}
4921   ins_pipe( pipe_slow );
4922 %}
4923 
4924 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
4925   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4926   match(Set dst (URShiftVS src shift));
4927   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4928   ins_encode %{
4929     bool vector256 = false;
4930     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4931   %}
4932   ins_pipe( pipe_slow );
4933 %}
4934 
4935 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4936   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4937   match(Set dst (URShiftVS src shift));
4938   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4939   ins_encode %{
4940     bool vector256 = false;
4941     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4942   %}
4943   ins_pipe( pipe_slow );
4944 %}
4945 
4946 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
4947   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4948   match(Set dst (URShiftVS src shift));
4949   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4950   ins_encode %{
4951     bool vector256 = true;
4952     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4953   %}
4954   ins_pipe( pipe_slow );
4955 %}
4956 
4957 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4958   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4959   match(Set dst (URShiftVS src shift));
4960   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4961   ins_encode %{
4962     bool vector256 = true;
4963     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4964   %}
4965   ins_pipe( pipe_slow );
4966 %}
4967 
4968 // Integers vector logical right shift
4969 instruct vsrl2I(vecD dst, vecS shift) %{
4970   predicate(n->as_Vector()->length() == 2);
4971   match(Set dst (URShiftVI dst shift));
4972   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4973   ins_encode %{
4974     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
4975   %}
4976   ins_pipe( pipe_slow );
4977 %}
4978 
4979 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
4980   predicate(n->as_Vector()->length() == 2);
4981   match(Set dst (URShiftVI dst shift));
4982   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4983   ins_encode %{
4984     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
4985   %}
4986   ins_pipe( pipe_slow );
4987 %}
4988 
4989 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
4990   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4991   match(Set dst (URShiftVI src shift));
4992   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4993   ins_encode %{
4994     bool vector256 = false;
4995     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4996   %}
4997   ins_pipe( pipe_slow );
4998 %}
4999 
5000 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
5001   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5002   match(Set dst (URShiftVI src shift));
5003   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
5004   ins_encode %{
5005     bool vector256 = false;
5006     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5007   %}
5008   ins_pipe( pipe_slow );
5009 %}
5010 
5011 instruct vsrl4I(vecX dst, vecS shift) %{
5012   predicate(n->as_Vector()->length() == 4);
5013   match(Set dst (URShiftVI dst shift));
5014   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
5015   ins_encode %{
5016     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
5017   %}
5018   ins_pipe( pipe_slow );
5019 %}
5020 
5021 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
5022   predicate(n->as_Vector()->length() == 4);
5023   match(Set dst (URShiftVI dst shift));
5024   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
5025   ins_encode %{
5026     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
5027   %}
5028   ins_pipe( pipe_slow );
5029 %}
5030 
5031 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
5032   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5033   match(Set dst (URShiftVI src shift));
5034   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
5035   ins_encode %{
5036     bool vector256 = false;
5037     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5038   %}
5039   ins_pipe( pipe_slow );
5040 %}
5041 
5042 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
5043   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5044   match(Set dst (URShiftVI src shift));
5045   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
5046   ins_encode %{
5047     bool vector256 = false;
5048     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5049   %}
5050   ins_pipe( pipe_slow );
5051 %}
5052 
5053 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
5054   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5055   match(Set dst (URShiftVI src shift));
5056   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
5057   ins_encode %{
5058     bool vector256 = true;
5059     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5060   %}
5061   ins_pipe( pipe_slow );
5062 %}
5063 
5064 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
5065   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5066   match(Set dst (URShiftVI src shift));
5067   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
5068   ins_encode %{
5069     bool vector256 = true;
5070     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5071   %}
5072   ins_pipe( pipe_slow );
5073 %}
5074 
5075 // Longs vector logical right shift
5076 instruct vsrl2L(vecX dst, vecS shift) %{
5077   predicate(n->as_Vector()->length() == 2);
5078   match(Set dst (URShiftVL dst shift));
5079   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
5080   ins_encode %{
5081     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
5082   %}
5083   ins_pipe( pipe_slow );
5084 %}
5085 
5086 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
5087   predicate(n->as_Vector()->length() == 2);
5088   match(Set dst (URShiftVL dst shift));
5089   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
5090   ins_encode %{
5091     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
5092   %}
5093   ins_pipe( pipe_slow );
5094 %}
5095 
5096 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
5097   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5098   match(Set dst (URShiftVL src shift));
5099   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
5100   ins_encode %{
5101     bool vector256 = false;
5102     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5103   %}
5104   ins_pipe( pipe_slow );
5105 %}
5106 
5107 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
5108   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5109   match(Set dst (URShiftVL src shift));
5110   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
5111   ins_encode %{
5112     bool vector256 = false;
5113     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5114   %}
5115   ins_pipe( pipe_slow );
5116 %}
5117 
5118 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
5119   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
5120   match(Set dst (URShiftVL src shift));
5121   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
5122   ins_encode %{
5123     bool vector256 = true;
5124     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5125   %}
5126   ins_pipe( pipe_slow );
5127 %}
5128 
5129 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
5130   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
5131   match(Set dst (URShiftVL src shift));
5132   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
5133   ins_encode %{
5134     bool vector256 = true;
5135     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5136   %}
5137   ins_pipe( pipe_slow );
5138 %}
5139 
5140 // ------------------- ArithmeticRightShift -----------------------------------
5141 
5142 // Shorts/Chars vector arithmetic right shift
5143 instruct vsra2S(vecS dst, vecS shift) %{
5144   predicate(n->as_Vector()->length() == 2);
5145   match(Set dst (RShiftVS dst shift));
5146   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
5147   ins_encode %{
5148     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
5149   %}
5150   ins_pipe( pipe_slow );
5151 %}
5152 
5153 instruct vsra2S_imm(vecS dst, immI8 shift) %{
5154   predicate(n->as_Vector()->length() == 2);
5155   match(Set dst (RShiftVS dst shift));
5156   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
5157   ins_encode %{
5158     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
5159   %}
5160   ins_pipe( pipe_slow );
5161 %}
5162 
5163 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
5164   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5165   match(Set dst (RShiftVS src shift));
5166   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
5167   ins_encode %{
5168     bool vector256 = false;
5169     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5170   %}
5171   ins_pipe( pipe_slow );
5172 %}
5173 
5174 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
5175   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5176   match(Set dst (RShiftVS src shift));
5177   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
5178   ins_encode %{
5179     bool vector256 = false;
5180     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5181   %}
5182   ins_pipe( pipe_slow );
5183 %}
5184 
5185 instruct vsra4S(vecD dst, vecS shift) %{
5186   predicate(n->as_Vector()->length() == 4);
5187   match(Set dst (RShiftVS dst shift));
5188   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
5189   ins_encode %{
5190     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 instruct vsra4S_imm(vecD dst, immI8 shift) %{
5196   predicate(n->as_Vector()->length() == 4);
5197   match(Set dst (RShiftVS dst shift));
5198   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
5199   ins_encode %{
5200     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
5201   %}
5202   ins_pipe( pipe_slow );
5203 %}
5204 
5205 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
5206   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5207   match(Set dst (RShiftVS src shift));
5208   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
5209   ins_encode %{
5210     bool vector256 = false;
5211     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5212   %}
5213   ins_pipe( pipe_slow );
5214 %}
5215 
5216 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
5217   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5218   match(Set dst (RShiftVS src shift));
5219   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
5220   ins_encode %{
5221     bool vector256 = false;
5222     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5223   %}
5224   ins_pipe( pipe_slow );
5225 %}
5226 
5227 instruct vsra8S(vecX dst, vecS shift) %{
5228   predicate(n->as_Vector()->length() == 8);
5229   match(Set dst (RShiftVS dst shift));
5230   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
5231   ins_encode %{
5232     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
5233   %}
5234   ins_pipe( pipe_slow );
5235 %}
5236 
5237 instruct vsra8S_imm(vecX dst, immI8 shift) %{
5238   predicate(n->as_Vector()->length() == 8);
5239   match(Set dst (RShiftVS dst shift));
5240   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
5241   ins_encode %{
5242     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
5243   %}
5244   ins_pipe( pipe_slow );
5245 %}
5246 
5247 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
5248   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5249   match(Set dst (RShiftVS src shift));
5250   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
5251   ins_encode %{
5252     bool vector256 = false;
5253     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5254   %}
5255   ins_pipe( pipe_slow );
5256 %}
5257 
5258 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
5259   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5260   match(Set dst (RShiftVS src shift));
5261   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
5262   ins_encode %{
5263     bool vector256 = false;
5264     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5265   %}
5266   ins_pipe( pipe_slow );
5267 %}
5268 
5269 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
5270   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5271   match(Set dst (RShiftVS src shift));
5272   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
5273   ins_encode %{
5274     bool vector256 = true;
5275     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5276   %}
5277   ins_pipe( pipe_slow );
5278 %}
5279 
5280 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
5281   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
5282   match(Set dst (RShiftVS src shift));
5283   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
5284   ins_encode %{
5285     bool vector256 = true;
5286     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5287   %}
5288   ins_pipe( pipe_slow );
5289 %}
5290 
5291 // Integers vector arithmetic right shift
5292 instruct vsra2I(vecD dst, vecS shift) %{
5293   predicate(n->as_Vector()->length() == 2);
5294   match(Set dst (RShiftVI dst shift));
5295   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
5296   ins_encode %{
5297     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
5298   %}
5299   ins_pipe( pipe_slow );
5300 %}
5301 
5302 instruct vsra2I_imm(vecD dst, immI8 shift) %{
5303   predicate(n->as_Vector()->length() == 2);
5304   match(Set dst (RShiftVI dst shift));
5305   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
5306   ins_encode %{
5307     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
5308   %}
5309   ins_pipe( pipe_slow );
5310 %}
5311 
5312 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
5313   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5314   match(Set dst (RShiftVI src shift));
5315   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
5316   ins_encode %{
5317     bool vector256 = false;
5318     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5319   %}
5320   ins_pipe( pipe_slow );
5321 %}
5322 
5323 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
5324   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5325   match(Set dst (RShiftVI src shift));
5326   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
5327   ins_encode %{
5328     bool vector256 = false;
5329     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5330   %}
5331   ins_pipe( pipe_slow );
5332 %}
5333 
5334 instruct vsra4I(vecX dst, vecS shift) %{
5335   predicate(n->as_Vector()->length() == 4);
5336   match(Set dst (RShiftVI dst shift));
5337   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
5338   ins_encode %{
5339     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
5340   %}
5341   ins_pipe( pipe_slow );
5342 %}
5343 
5344 instruct vsra4I_imm(vecX dst, immI8 shift) %{
5345   predicate(n->as_Vector()->length() == 4);
5346   match(Set dst (RShiftVI dst shift));
5347   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
5348   ins_encode %{
5349     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
5350   %}
5351   ins_pipe( pipe_slow );
5352 %}
5353 
5354 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
5355   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5356   match(Set dst (RShiftVI src shift));
5357   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
5358   ins_encode %{
5359     bool vector256 = false;
5360     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5361   %}
5362   ins_pipe( pipe_slow );
5363 %}
5364 
5365 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
5366   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5367   match(Set dst (RShiftVI src shift));
5368   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
5369   ins_encode %{
5370     bool vector256 = false;
5371     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5372   %}
5373   ins_pipe( pipe_slow );
5374 %}
5375 
5376 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
5377   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5378   match(Set dst (RShiftVI src shift));
5379   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
5380   ins_encode %{
5381     bool vector256 = true;
5382     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
5383   %}
5384   ins_pipe( pipe_slow );
5385 %}
5386 
5387 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
5388   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
5389   match(Set dst (RShiftVI src shift));
5390   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
5391   ins_encode %{
5392     bool vector256 = true;
5393     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
5394   %}
5395   ins_pipe( pipe_slow );
5396 %}
5397 
5398 // There are no longs vector arithmetic right shift instructions.
5399 
5400 
5401 // --------------------------------- AND --------------------------------------
5402 
5403 instruct vand4B(vecS dst, vecS src) %{
5404   predicate(n->as_Vector()->length_in_bytes() == 4);
5405   match(Set dst (AndV dst src));
5406   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
5407   ins_encode %{
5408     __ pand($dst$$XMMRegister, $src$$XMMRegister);
5409   %}
5410   ins_pipe( pipe_slow );
5411 %}
5412 
5413 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
5414   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5415   match(Set dst (AndV src1 src2));
5416   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
5417   ins_encode %{
5418     bool vector256 = false;
5419     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5420   %}
5421   ins_pipe( pipe_slow );
5422 %}
5423 
5424 instruct vand8B(vecD dst, vecD src) %{
5425   predicate(n->as_Vector()->length_in_bytes() == 8);
5426   match(Set dst (AndV dst src));
5427   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
5428   ins_encode %{
5429     __ pand($dst$$XMMRegister, $src$$XMMRegister);
5430   %}
5431   ins_pipe( pipe_slow );
5432 %}
5433 
5434 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
5435   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5436   match(Set dst (AndV src1 src2));
5437   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
5438   ins_encode %{
5439     bool vector256 = false;
5440     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5441   %}
5442   ins_pipe( pipe_slow );
5443 %}
5444 
5445 instruct vand16B(vecX dst, vecX src) %{
5446   predicate(n->as_Vector()->length_in_bytes() == 16);
5447   match(Set dst (AndV dst src));
5448   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
5449   ins_encode %{
5450     __ pand($dst$$XMMRegister, $src$$XMMRegister);
5451   %}
5452   ins_pipe( pipe_slow );
5453 %}
5454 
5455 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
5456   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5457   match(Set dst (AndV src1 src2));
5458   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
5459   ins_encode %{
5460     bool vector256 = false;
5461     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5462   %}
5463   ins_pipe( pipe_slow );
5464 %}
5465 
5466 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
5467   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5468   match(Set dst (AndV src (LoadVector mem)));
5469   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
5470   ins_encode %{
5471     bool vector256 = false;
5472     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5473   %}
5474   ins_pipe( pipe_slow );
5475 %}
5476 
5477 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
5478   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5479   match(Set dst (AndV src1 src2));
5480   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
5481   ins_encode %{
5482     bool vector256 = true;
5483     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5484   %}
5485   ins_pipe( pipe_slow );
5486 %}
5487 
5488 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
5489   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5490   match(Set dst (AndV src (LoadVector mem)));
5491   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
5492   ins_encode %{
5493     bool vector256 = true;
5494     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5495   %}
5496   ins_pipe( pipe_slow );
5497 %}
5498 
5499 // --------------------------------- OR ---------------------------------------
5500 
5501 instruct vor4B(vecS dst, vecS src) %{
5502   predicate(n->as_Vector()->length_in_bytes() == 4);
5503   match(Set dst (OrV dst src));
5504   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
5505   ins_encode %{
5506     __ por($dst$$XMMRegister, $src$$XMMRegister);
5507   %}
5508   ins_pipe( pipe_slow );
5509 %}
5510 
5511 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
5512   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5513   match(Set dst (OrV src1 src2));
5514   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
5515   ins_encode %{
5516     bool vector256 = false;
5517     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5518   %}
5519   ins_pipe( pipe_slow );
5520 %}
5521 
5522 instruct vor8B(vecD dst, vecD src) %{
5523   predicate(n->as_Vector()->length_in_bytes() == 8);
5524   match(Set dst (OrV dst src));
5525   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
5526   ins_encode %{
5527     __ por($dst$$XMMRegister, $src$$XMMRegister);
5528   %}
5529   ins_pipe( pipe_slow );
5530 %}
5531 
5532 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
5533   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5534   match(Set dst (OrV src1 src2));
5535   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
5536   ins_encode %{
5537     bool vector256 = false;
5538     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5539   %}
5540   ins_pipe( pipe_slow );
5541 %}
5542 
5543 instruct vor16B(vecX dst, vecX src) %{
5544   predicate(n->as_Vector()->length_in_bytes() == 16);
5545   match(Set dst (OrV dst src));
5546   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
5547   ins_encode %{
5548     __ por($dst$$XMMRegister, $src$$XMMRegister);
5549   %}
5550   ins_pipe( pipe_slow );
5551 %}
5552 
5553 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
5554   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5555   match(Set dst (OrV src1 src2));
5556   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
5557   ins_encode %{
5558     bool vector256 = false;
5559     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5560   %}
5561   ins_pipe( pipe_slow );
5562 %}
5563 
5564 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
5565   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5566   match(Set dst (OrV src (LoadVector mem)));
5567   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
5568   ins_encode %{
5569     bool vector256 = false;
5570     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5571   %}
5572   ins_pipe( pipe_slow );
5573 %}
5574 
5575 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
5576   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5577   match(Set dst (OrV src1 src2));
5578   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
5579   ins_encode %{
5580     bool vector256 = true;
5581     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5582   %}
5583   ins_pipe( pipe_slow );
5584 %}
5585 
5586 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
5587   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5588   match(Set dst (OrV src (LoadVector mem)));
5589   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
5590   ins_encode %{
5591     bool vector256 = true;
5592     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5593   %}
5594   ins_pipe( pipe_slow );
5595 %}
5596 
5597 // --------------------------------- XOR --------------------------------------
5598 
5599 instruct vxor4B(vecS dst, vecS src) %{
5600   predicate(n->as_Vector()->length_in_bytes() == 4);
5601   match(Set dst (XorV dst src));
5602   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
5603   ins_encode %{
5604     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5605   %}
5606   ins_pipe( pipe_slow );
5607 %}
5608 
5609 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
5610   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5611   match(Set dst (XorV src1 src2));
5612   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
5613   ins_encode %{
5614     bool vector256 = false;
5615     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5616   %}
5617   ins_pipe( pipe_slow );
5618 %}
5619 
5620 instruct vxor8B(vecD dst, vecD src) %{
5621   predicate(n->as_Vector()->length_in_bytes() == 8);
5622   match(Set dst (XorV dst src));
5623   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
5624   ins_encode %{
5625     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5626   %}
5627   ins_pipe( pipe_slow );
5628 %}
5629 
5630 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
5631   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5632   match(Set dst (XorV src1 src2));
5633   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
5634   ins_encode %{
5635     bool vector256 = false;
5636     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5637   %}
5638   ins_pipe( pipe_slow );
5639 %}
5640 
5641 instruct vxor16B(vecX dst, vecX src) %{
5642   predicate(n->as_Vector()->length_in_bytes() == 16);
5643   match(Set dst (XorV dst src));
5644   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
5645   ins_encode %{
5646     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5647   %}
5648   ins_pipe( pipe_slow );
5649 %}
5650 
5651 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
5652   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5653   match(Set dst (XorV src1 src2));
5654   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
5655   ins_encode %{
5656     bool vector256 = false;
5657     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5658   %}
5659   ins_pipe( pipe_slow );
5660 %}
5661 
5662 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
5663   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5664   match(Set dst (XorV src (LoadVector mem)));
5665   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
5666   ins_encode %{
5667     bool vector256 = false;
5668     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5669   %}
5670   ins_pipe( pipe_slow );
5671 %}
5672 
5673 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
5674   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5675   match(Set dst (XorV src1 src2));
5676   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
5677   ins_encode %{
5678     bool vector256 = true;
5679     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5680   %}
5681   ins_pipe( pipe_slow );
5682 %}
5683 
5684 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
5685   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5686   match(Set dst (XorV src (LoadVector mem)));
5687   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
5688   ins_encode %{
5689     bool vector256 = true;
5690     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5691   %}
5692   ins_pipe( pipe_slow );
5693 %}
5694