//
// Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
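//
// For orientation only, a save-on-entry integer register would be declared
// along these lines (a hypothetical sketch -- "RFOO" is not a real register
// in this file):
//
//   reg_def RFOO(SOE, SOE, Op_RegI, 3, rfoo->as_VMReg());
//
// i.e. save-on-entry for both the allocator and the C convention, spilled
// with LoadI/StoreI, and encoding 3 placed into opcodes that name it.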

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              (XMM16-XMM31 are volatile)
//              XMM0-XMM3 might hold parameters
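//
// Each 512-bit register is thus described by 16 consecutive Op_RegF slots:
// word (a) alone is the 32-bit Float view, words (a)-(b) the 64-bit Double
// view, (a)-(d) the 128-bit XMM view, (a)-(h) the 256-bit YMM view, and
// (a)-(p) the full ZMM register. The vector register classes further down
// in this file are built from exactly these slices.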

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64
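
// Allocation classes group the registers into chunks for the register
// allocator; their order here matters, which is why the flags class below
// is kept last. chunk1 holds the XMM file -- the general-purpose chunk0 is
// expected to live in the per-platform x86_32.ad/x86_64.ad files (an
// assumption from the surrounding sources, not stated in this file).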

alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

// The flags allocation class should be last.
alloc_class chunk2(RFLAGS);

// Singleton class for condition codes
reg_class int_flags(RFLAGS);
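
// Each reg_class below defines a register mask that instruction operands
// later in this file draw from. The *_legacy/*_evex pairs exist because the
// usable XMM file differs between pre-AVX-512 and AVX-512 hardware, and
// reg_class_dynamic stitches each pair together under a runtime predicate.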

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );
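
// A reg_class_dynamic resolves to its first (EVEX) class when the
// %{ ... %} predicate holds and to its second (legacy) class otherwise.
// The _vl variants additionally require AVX512VL -- EVEX encodings at
// 128/256-bit vector lengths -- before exposing XMM16-XMM31.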

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
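
// supports_avx512vlbwdq() is taken here to mean the AVX512VL, AVX512BW and
// AVX512DQ extensions together (an assumption from the flag's name); only
// on such hardware do these vector classes extend to XMM16-XMM31.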

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 128-bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for EVEX 128-bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
// Class for all 256bit vector registers (EVEX variant: includes XMM16-XMM31 on LP64)
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
// Class for all 512bit vector registers (EVEX variant: includes XMM16-XMM31 on LP64)
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
// The handler simply jumps to the runtime routine that dispatches the pending exception.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
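
// Worked example for the 64-bit sequence above: "call next" pushes the
// address of the instruction following the call (label "next") onto the
// stack.  Because that call is the only instruction emitted after the_pc
// was captured, __ offset() - offset equals the call's encoded length, so
// subtracting it from [rsp] rewrites the pushed return address back to
// the_pc without disturbing any (potentially live) register.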
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369   static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
1370   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
1371   static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
1372   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
1373   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1374   static address vector_iota_indices() { return StubRoutines::x86::vector_iota_indices(); }
1375   static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
1376   static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
1377   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1378   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1379   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1380   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1381   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1382   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1383   static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
1384   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1385   static address vector_int_sizemask() { return StubRoutines::x86::vector_int_size_mask(); }
1386   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1387   static address vector_short_sizemask() { return StubRoutines::x86::vector_short_size_mask(); }
1388   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1389   static address vector_long_sizemask() { return StubRoutines::x86::vector_long_size_mask(); }
1390 #else
1391   static address float_signmask()  { return (address)float_signmask_pool; }
1392   static address float_signflip()  { return (address)float_signflip_pool; }
1393   static address double_signmask() { return (address)double_signmask_pool; }
1394   static address double_signflip() { return (address)double_signflip_pool; }
1395 #endif
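
// These accessors return addresses of small constant bit-pattern tables.
// As an illustration (values per the 64-bit stub generators): the
// float_signmask() table holds 0x7FFFFFFF replicated, so
//   andps xmm0, [float_signmask()]   // abs(float): clear the sign bit
// while float_signflip() holds 0x80000000 replicated, so xorps with it
// negates.  The vector_* tables back the corresponding vector nodes.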
1396 
1397 
1398 const bool Matcher::match_rule_supported(int opcode) {
1399   if (!has_match_rule(opcode))
1400     return false;
1401 
1402   bool ret_value = true;
1403   switch (opcode) {
1404     case Op_PopCountI:
1405     case Op_PopCountL:
1406       if (!UsePopCountInstruction)
1407         ret_value = false;
1408       break;
1409     case Op_PopCountVI:
1410       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1411         ret_value = false;
1412       break;
1413     case Op_MulVI:
1414     case Op_MulVL:
1415       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1416         ret_value = false;
1417       break;
1418     case Op_MulReductionVL:
1419       if (VM_Version::supports_avx512dq() == false)
1420         ret_value = false;
1421       break;
1422     case Op_AddReductionVL:
1423       if (UseSSE < 2) // requires at least SSE2
1424         ret_value = false;
1425       break;
1426     case Op_MulReductionVI:
1427       if (UseSSE < 4) // requires at least SSE4
1428         ret_value = false;
1429       break;
1430     case Op_AddReductionVF:
1431     case Op_AddReductionVD:
1432     case Op_MulReductionVF:
1433     case Op_MulReductionVD:
1434       if (UseSSE < 1) // requires at least SSE
1435         ret_value = false;
1436       break;
1437     case Op_SqrtVD:
1438     case Op_SqrtVF:
1439       if (UseAVX < 1) // enabled for AVX only
1440         ret_value = false;
1441       break;
1442     case Op_CompareAndSwapL:
1443 #ifdef _LP64
1444     case Op_CompareAndSwapP:
1445 #endif
1446       if (!VM_Version::supports_cx8())
1447         ret_value = false;
1448       break;
1449     case Op_CMoveVF:
1450     case Op_CMoveVD:
1451       if (UseAVX < 1 || UseAVX > 2)
1452         ret_value = false;
1453       break;
1454     case Op_StrIndexOf:
1455       if (!UseSSE42Intrinsics)
1456         ret_value = false;
1457       break;
1458     case Op_StrIndexOfChar:
1459       if (!UseSSE42Intrinsics)
1460         ret_value = false;
1461       break;
1462     case Op_OnSpinWait:
1463       if (VM_Version::supports_on_spin_wait() == false)
1464         ret_value = false;
1465       break;
1466     case Op_MulAddVS2VI:
1467       if (UseSSE < 2)
1468         ret_value = false;
1469       break;
1470 #ifdef _LP64
1471     case Op_MaxD:
1472     case Op_MaxF:
1473     case Op_MinD:
1474     case Op_MinF:
1475       if (UseAVX < 1) // enabled for AVX only
1476         ret_value = false;
1477       break;
1478 #endif
1479   }
1480 
  return ret_value;  // By default, match rules are supported.
1482 }
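
// For example, with UseSSE == 2 and UseAVX == 0 the switch above rejects
// Op_MulReductionVI (needs SSE4) and Op_SqrtVF (AVX only), while
// Op_AddReductionVL, which needs only SSE2, remains supported.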
1483 
1484 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt, int op_arity) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, while guarding with vlen.
1487   bool ret_value = match_rule_supported(opcode);
1488   if (ret_value) {
1489     int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1490     if (!vector_size_supported(bt, vlen)) {
1491       ret_value = false;
1492     } else if (size_in_bits > 256 && UseAVX <= 2) {
1493       // Only AVX512 supports 512-bit vectors
1494       ret_value = false;
1495     } else if (UseAVX == 0 && size_in_bits > 128) {
1496       // Only AVX supports 256-bit vectors
1497       ret_value = false;
    } else if (is_subword_type(bt) && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
      // Byte and short element types are not supported in 512-bit vectors unless AVX512BW is available.
1500       ret_value = false;
1501     } else {
1502         switch (opcode) {
1503         case Op_AbsV:
1504           if (is_integral_type(bt) && UseSSE < 3) { ret_value = false; }
1505           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1506           else if (bt == T_LONG && UseAVX <= 2) { ret_value = false; } // Implementation limitation
1507           break;
1508         case Op_AddVB:
1509         case Op_SubVB:
1510           if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1511             ret_value = false;
1512           break;
1513         case Op_MaxV:
1514         case Op_MinV:
1515           if (UseSSE < 4 && (bt == T_BYTE || bt == T_INT || bt == T_LONG))
1516             ret_value = false;
1517 
          if (bt == T_FLOAT || bt == T_DOUBLE) {
1519             // Float/Double intrinsics are enabled for AVX family currently.
1520             if (UseAVX == 0)
1521               ret_value = false;
1522             // 512 bit Float/Double intrinsics need AVX512DQ
1523             if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512))
1524               ret_value = false;
1525           }
1526           break;
1527         case Op_MulVB:
1528         case Op_LShiftVB:
1529         case Op_RShiftVB:
1530         case Op_URShiftVB:
1531         case Op_LShiftVS:
1532         case Op_RShiftVS:
1533         case Op_URShiftVS:
1534           if (size_in_bits <= 128 && UseSSE < 4) { ret_value = false; }
1535           else if (size_in_bits > 256 && UseAVX < 2) { ret_value = false; }
1536           break;
        case Op_LShiftVI:
        case Op_RShiftVI:
        case Op_URShiftVI:
        case Op_LShiftVL:
        case Op_RShiftVL:
        case Op_URShiftVL:
          // Two-operand (variable-count) vector shifts require AVX2.
          if (op_arity == 2 && UseAVX <= 1)
            ret_value = false;
          break;
1549         case Op_MulVS:
1550         case Op_AddVS:
1551         case Op_SubVS:
1552           if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1553             ret_value = false;
1554           break;
1555         case Op_CallLeafVector:
1556           if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq())
1557             ret_value = false;
1558           break;
        case Op_CMoveVF:
          if (vlen != 8)
            ret_value = false;
          break;
        case Op_CMoveVD:
          if (vlen != 4)
            ret_value = false;
          break;
1567         case Op_AddReductionVI:
1568           if (bt == T_INT && UseSSE < 3) { ret_value = false; }
1569           else if (is_subword_type(bt) && UseSSE <= 3) { ret_value = false; }
1570           break;
1571         case Op_AndReductionV:
1572         case Op_OrReductionV:
1573         case Op_XorReductionV:
1574           if (bt == T_BYTE && UseSSE <= 3) { ret_value = false; }
1575           break;
1576         case Op_VectorMaskCmp:
1577           if (UseAVX <= 0) { ret_value = false; }
1578           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1579           break;
1580         case Op_MinReductionV:
1581         case Op_MaxReductionV:
1582           if ((bt == T_INT || bt == T_LONG || bt == T_BYTE) && UseSSE <= 3) { ret_value = false; }
1583           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1584 
1585           // Float/Double intrinsics enabled for AVX family.
1586           if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1587             ret_value = false;
1588           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512))
1589             ret_value = false;
1590           break;
1591         case Op_VectorBlend:
1592           if (UseSSE <= 3 && UseAVX == 0) { ret_value = false; }
1593           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1594           break;
1595         case Op_VectorTest:
1596           if (UseAVX <= 0) { ret_value = false; }
1597           else if (size_in_bits != 128 && size_in_bits != 256) { ret_value = false; } // Implementation limitation
1598           break;
1599         case Op_VectorLoadMask:
1600           if (UseSSE <= 3) { ret_value = false; }
1601           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1602           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
1603           break;
1604         case Op_VectorLoadShuffle:
1605         case Op_VectorRearrange:
1606           if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation due to how shuffle is loaded
1607           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
1608           else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512vbmi())  { ret_value = false; } // Implementation limitation
1609           else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512vlbw())  { ret_value = false; } // Implementation limitation
1610           break;
1611         case Op_VectorStoreMask:
1612           if (UseAVX < 0) { ret_value = false; } // Implementation limitation
1613           else if ((size_in_bits >= 256 || bt == T_LONG || bt == T_DOUBLE) && UseAVX < 2) { ret_value = false; } // Implementation limitation
1614           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1615           else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation
1616           break;
1617         case Op_VectorCastB2X:
1618           if (UseAVX <= 0) { ret_value = false; }
1619           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
1620           break;
1621         case Op_VectorCastS2X:
1622           if (UseAVX <= 0) { ret_value = false; }
1623           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1624           else if (is_integral_type(bt) && vlen * type2aelembytes(T_SHORT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1625           break;
1626         case Op_VectorCastI2X:
1627           if (UseAVX <= 0) { ret_value = false; }
1628           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1629           else if (is_integral_type(bt) && vlen * type2aelembytes(T_INT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1630           break;
1631         case Op_VectorCastL2X:
1632           if (UseAVX <= 0) { ret_value = false; }
1633           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1634           else if (is_integral_type(bt) && vlen * type2aelembytes(T_LONG) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1635           else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) { ret_value = false; }
1636           break;
1637         case Op_VectorCastF2X:
1638           // Casts from FP to integral types require special fixup logic not easily
1639           // implementable with vectors.
1640           if (UseAVX <= 0) { ret_value = false; }
1641           else if (bt != T_DOUBLE) { ret_value = false; } // Implementation limitation
1642           break;
1643         case Op_VectorCastD2X:
1644           // Casts from FP to integral types require special fixup logic not easily
1645           // implementable with vectors.
1646           if (UseAVX <= 0) { ret_value = false; }
1647           else if (bt != T_FLOAT) { ret_value = false; } // Implementation limitation
1648           break;
1649         case Op_VectorReinterpret:
1650           if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
1651           break;
1652         case Op_MulReductionVI:
          if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; }
1654           break;
        case Op_FmaVD:
        case Op_FmaVF:
          if (!UseFMA) { ret_value = false; }
          break;
        case Op_LoadVectorGather:
          if (UseAVX < 2) { ret_value = false; }
          else if (size_in_bits == 64) { ret_value = false; }
          break;
        case Op_StoreVectorScatter:
          if (UseAVX < 3) { ret_value = false; }
          else if (size_in_bits == 64) { ret_value = false; }
          break;
1666         default:
1667           break;
1668       }
1669     }
1670   }
1671   if (ret_value) {
1672     assert(is_java_primitive(bt) && (vlen > 0) && is_power_of_2(vlen) &&
1673            vector_size_supported(bt, vlen), "must be supported");
1674   }
1675 
  return ret_value;  // By default, match rules are supported.
1677 }
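
// For example, a 64-lane byte vector (512 bits) hits the vlen == 64 check
// for Op_AddVB/Op_SubVB above, so byte-vector add/subtract at that width
// is rejected unless AVX512BW is available.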
1678 
1679 const bool Matcher::has_predicated_vectors(void) {
1680   bool ret_value = false;
1681   if (UseAVX > 2) {
1682     ret_value = VM_Version::supports_avx512vl();
1683   }
1684 
1685   return ret_value;
1686 }
1687 
1688 const int Matcher::float_pressure(int default_pressure_threshold) {
1689   int float_pressure_threshold = default_pressure_threshold;
1690 #ifdef _LP64
1691   if (UseAVX > 2) {
1692     // Increase pressure threshold on machines with AVX3 which have
1693     // 2x more XMM registers.
1694     float_pressure_threshold = default_pressure_threshold * 2;
1695   }
1696 #endif
1697   return float_pressure_threshold;
1698 }
1699 
1700 // Max vector size in bytes. 0 if not supported.
1701 const int Matcher::vector_width_in_bytes(BasicType bt) {
1702   assert(is_java_primitive(bt), "only primitive type vectors");
1703   if (UseSSE < 2) return 0;
1704   // SSE2 supports 128bit vectors for all types.
1705   // AVX2 supports 256bit vectors for all types.
  // AVX-512/EVEX supports 512bit vectors for all types.
1707   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1708   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1709   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1710     size = (UseAVX > 2) ? 64 : 32;
1711   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1712     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1713   // Use flag to limit vector size.
1714   size = MIN2(size,(int)MaxVectorSize);
1715   // Minimum 2 values in vector (or 4 for bytes).
1716   switch (bt) {
1717   case T_DOUBLE:
1718   case T_LONG:
1719     if (size < 16) return 0;
1720     break;
1721   case T_FLOAT:
1722   case T_INT:
1723     if (size < 8) return 0;
1724     break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
1737   default:
1738     ShouldNotReachHere();
1739   }
1740   return size;
1741 }
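
// For illustration: with UseAVX == 2 the computed width is (1 << 2) * 8 =
// 32 bytes for every element type; with UseAVX == 3 it becomes 64 bytes,
// except that T_BYTE/T_SHORT/T_CHAR stay at 32 bytes unless AVX512BW is
// available.  The result is then capped by MaxVectorSize, so for instance
// -XX:MaxVectorSize=16 limits all vectors to 128 bits.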
1742 
1743 // Limits on vector size (number of elements) loaded into vector.
1744 const int Matcher::max_vector_size(const BasicType bt) {
1745   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1746 }
1747 const int Matcher::min_vector_size(const BasicType bt) {
1748   int max_size = max_vector_size(bt);
  // The minimum vector payload that can be loaded is 4 bytes:
  // 4 elements for byte-sized types, 2 elements for everything else.
  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  return MIN2(size, max_size);
1752 }
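
// For example, min_vector_size(T_BYTE) is 4 elements (4 bytes) and
// min_vector_size(T_INT) is 2 elements (8 bytes), assuming the maximum
// vector size for the type is at least that large.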
1753 
1754 // Vector ideal reg corresponding to specified size in bytes
1755 const uint Matcher::vector_ideal_reg(int size) {
1756   assert(MaxVectorSize >= size, "");
1757   switch(size) {
1758     case  4: return Op_VecS;
1759     case  8: return Op_VecD;
1760     case 16: return Op_VecX;
1761     case 32: return Op_VecY;
1762     case 64: return Op_VecZ;
1763   }
1764   ShouldNotReachHere();
1765   return 0;
1766 }
1767 
1768 // Only lowest bits of xmm reg are used for vector shift count.
1769 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1770   return Op_VecS;
1771 }
1772 
// x86 supports misaligned vector loads and stores.
1774 const bool Matcher::misaligned_vectors_ok() {
1775   return true;
1776 }
1777 
1778 // x86 AES instructions are compatible with SunJCE expanded
1779 // keys, hence we do not need to pass the original key to stubs
1780 const bool Matcher::pass_original_key_for_aes() {
1781   return false;
1782 }
1783 
1784 
1785 const bool Matcher::convi2l_type_required = true;
1786 
1787 // Check for shift by small constant as well
1788 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1789   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1790       shift->in(2)->get_int() <= 3 &&
1791       // Are there other uses besides address expressions?
1792       !matcher->is_visited(shift)) {
1793     address_visited.set(shift->_idx); // Flag as address_visited
1794     mstack.push(shift->in(2), Matcher::Visit);
1795     Node *conv = shift->in(1);
1796 #ifdef _LP64
    // Allow the Matcher to match the rule that bypasses the
    // ConvI2L operation for an array index on LP64
    // if the index value is positive.
1800     if (conv->Opcode() == Op_ConvI2L &&
1801         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1802         // Are there other uses besides address expressions?
1803         !matcher->is_visited(conv)) {
1804       address_visited.set(conv->_idx); // Flag as address_visited
1805       mstack.push(conv->in(1), Matcher::Pre_Visit);
1806     } else
1807 #endif
1808       mstack.push(conv, Matcher::Pre_Visit);
1809     return true;
1810   }
1811   return false;
1812 }
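
// For illustration: an address expression such as base + (i << 3), as
// produced by a long[] element access, passes the checks above (constant
// shift <= 3 with no other uses), so the shift is cloned into the address
// and matched as the x86 scaled-index form [base + index*8] instead of
// being computed into a separate register.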
1813 
1814 // Should the Matcher clone shifts on addressing modes, expecting them
1815 // to be subsumed into complex addressing expressions or compute them
1816 // into registers?
1817 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1818   Node *off = m->in(AddPNode::Offset);
1819   if (off->is_Con()) {
1820     address_visited.test_set(m->_idx); // Flag as address_visited
1821     Node *adr = m->in(AddPNode::Address);
1822 
    // Intel can handle 2 adds in an addressing mode.
    // AtomicAdd is not an addressing expression.
    // Cheap to find it by looking for a screwy base.
1826     if (adr->is_AddP() &&
1827         !adr->in(AddPNode::Base)->is_top() &&
1828         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1829         // Are there other uses besides address expressions?
1830         !is_visited(adr)) {
1831       address_visited.set(adr->_idx); // Flag as address_visited
1832       Node *shift = adr->in(AddPNode::Offset);
1833       if (!clone_shift(shift, this, mstack, address_visited)) {
1834         mstack.push(shift, Pre_Visit);
1835       }
1836       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1837       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1838     } else {
1839       mstack.push(adr, Pre_Visit);
1840     }
1841 
1842     // Clone X+offset as it also folds into most addressing expressions
1843     mstack.push(off, Visit);
1844     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1845     return true;
1846   } else if (clone_shift(off, this, mstack, address_visited)) {
1847     address_visited.test_set(m->_idx); // Flag as address_visited
1848     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1849     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1850     return true;
1851   }
1852   return false;
1853 }
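
// For illustration: for an expression like base + 24 + (i << 3) the code
// above clones both the constant offset and the shifted index into the
// address, letting the matcher emit a single operand of the form
// [base + index*8 + 24] rather than materializing the sum in a register.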
1854 
1855 void Compile::reshape_address(AddPNode* addp) {
1856 }
1857 
1858 // Helper methods for MachSpillCopyNode::implementation().
1859 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1860                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is very complex, so the size is
  // determined by emitting the instructions into a scratch buffer.
1863   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1864   assert(ireg == Op_VecS || // 32bit vector
1865          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1866          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1867          "no non-adjacent vector moves" );
1868   if (cbuf) {
1869     MacroAssembler _masm(cbuf);
1870     int offset = __ offset();
1871     switch (ireg) {
1872     case Op_VecS: // copy whole register
1873     case Op_VecD:
1874     case Op_VecX:
1875 #ifndef _LP64
1876       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1877 #else
1878       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1879         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1880       } else {
1881         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
1883 #endif
1884       break;
1885     case Op_VecY:
1886 #ifndef _LP64
1887       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1888 #else
1889       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1890         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1891       } else {
1892         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
1894 #endif
1895       break;
1896     case Op_VecZ:
1897       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1898       break;
1899     default:
1900       ShouldNotReachHere();
1901     }
1902     int size = __ offset() - offset;
1903 #ifdef ASSERT
1904     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1906 #endif
1907     return size;
1908 #ifndef PRODUCT
1909   } else if (!do_size) {
1910     switch (ireg) {
1911     case Op_VecS:
1912     case Op_VecD:
1913     case Op_VecX:
1914       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1915       break;
1916     case Op_VecY:
1917     case Op_VecZ:
1918       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1919       break;
1920     default:
1921       ShouldNotReachHere();
1922     }
1923 #endif
1924   }
1925   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1926   return (UseAVX > 2) ? 6 : 4;
1927 }
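
// Note: on AVX-512 targets without AVX512VL, the plain (v)movdqu forms
// cannot encode the extended XMM16-XMM31 registers, which is why the
// EVEX-encoded vextractf32x4/vextractf64x4 (selector 0) are used above to
// copy the low 128/256 bits of such registers instead.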
1928 
1929 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1930                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM size calculation is very complex, so the size is
  // determined by emitting the instructions into a scratch buffer.
1933   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1934   if (cbuf) {
1935     MacroAssembler _masm(cbuf);
1936     int offset = __ offset();
1937     if (is_load) {
1938       switch (ireg) {
1939       case Op_VecS:
1940         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1941         break;
1942       case Op_VecD:
1943         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1944         break;
1945       case Op_VecX:
1946 #ifndef _LP64
1947         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1948 #else
1949         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1950           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1951         } else {
1952           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
1954         }
1955 #endif
1956         break;
1957       case Op_VecY:
1958 #ifndef _LP64
1959         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1960 #else
1961         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1962           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1963         } else {
1964           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
1966         }
1967 #endif
1968         break;
1969       case Op_VecZ:
1970         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1971         break;
1972       default:
1973         ShouldNotReachHere();
1974       }
1975     } else { // store
1976       switch (ireg) {
1977       case Op_VecS:
1978         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1979         break;
1980       case Op_VecD:
1981         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1982         break;
1983       case Op_VecX:
1984 #ifndef _LP64
1985         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1986 #else
        if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
          __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        } else {
          __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
        }
1993 #endif
1994         break;
1995       case Op_VecY:
1996 #ifndef _LP64
1997         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1998 #else
1999         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2000           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2001         }
2002         else {
2003           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2004         }
2005 #endif
2006         break;
2007       case Op_VecZ:
2008         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2009         break;
2010       default:
2011         ShouldNotReachHere();
2012       }
2013     }
2014     int size = __ offset() - offset;
2015 #ifdef ASSERT
2016     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
2017     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
2018     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
2019 #endif
2020     return size;
2021 #ifndef PRODUCT
2022   } else if (!do_size) {
2023     if (is_load) {
2024       switch (ireg) {
2025       case Op_VecS:
2026         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2027         break;
2028       case Op_VecD:
2029         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2030         break;
2031       case Op_VecX:
2032         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2033         break;
2034       case Op_VecY:
2035       case Op_VecZ:
2036         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2037         break;
2038       default:
2039         ShouldNotReachHere();
2040       }
2041     } else { // store
2042       switch (ireg) {
2043       case Op_VecS:
2044         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2045         break;
2046       case Op_VecD:
2047         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2048         break;
2049       case Op_VecX:
2050         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2051         break;
2052       case Op_VecY:
2053       case Op_VecZ:
2054         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2055         break;
2056       default:
2057         ShouldNotReachHere();
2058       }
2059     }
2060 #endif
2061   }
2062   bool is_single_byte = false;
2063   int vec_len = 0;
2064   if ((UseAVX > 2) && (stack_offset != 0)) {
2065     int tuple_type = Assembler::EVEX_FVM;
2066     int input_size = Assembler::EVEX_32bit;
2067     switch (ireg) {
2068     case Op_VecS:
2069       tuple_type = Assembler::EVEX_T1S;
2070       break;
2071     case Op_VecD:
2072       tuple_type = Assembler::EVEX_T1S;
2073       input_size = Assembler::EVEX_64bit;
2074       break;
2075     case Op_VecX:
2076       break;
2077     case Op_VecY:
2078       vec_len = 1;
2079       break;
2080     case Op_VecZ:
2081       vec_len = 2;
2082       break;
2083     }
2084     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
2085   }
2086   int offset_size = 0;
2087   int size = 5;
2088   if (UseAVX > 2) {
2089     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
2090       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2091       size += 2; // Need an additional two bytes for EVEX encoding
2092     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
2093       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2094     } else {
2095       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2096       size += 2; // Need an additional two bytes for EVEX encoding
2097     }
2098   } else {
2099     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2100   }
2101   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
2102   return size+offset_size;
2103 }
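
// For illustration: with UseAVX == 1 (vec_len == 0) and stack_offset == 16,
// vec_spill_helper returns 5 + 1 == 6 bytes; with stack_offset == 0x100 it
// returns 5 + 4 == 9 bytes, since the displacement no longer fits in one byte.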
2104 
2105 static inline jint replicate4_imm(int con, int width) {
2106   // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
2107   assert(width == 1 || width == 2, "only byte or short types here");
2108   int bit_width = width * 8;
2109   jint val = con;
2110   val &= (1 << bit_width) - 1;  // mask off sign bits
2111   while (bit_width < 32) {
2112     val |= (val << bit_width);
2113     bit_width <<= 1;
2114   }
2115   return val;
2116 }
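
// Worked example: replicate4_imm(0x41, 1) masks the constant to 0x41, then
// doubles it up -- 0x4141, 0x41414141 -- and returns 0x41414141.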
2117 
2118 static inline jlong replicate8_imm(int con, int width) {
2119   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
2120   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2121   int bit_width = width * 8;
2122   jlong val = con;
2123   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2124   while (bit_width < 64) {
2125     val |= (val << bit_width);
2126     bit_width <<= 1;
2127   }
2128   return val;
2129 }
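
// Worked example: replicate8_imm(0x1234, 2) doubles 0x1234 up to 0x12341234
// and then to 0x1234123412341234.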
2130 
2131 
2132 #ifndef PRODUCT
2133   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2134     st->print("nop \t# %d bytes of padding for loops and calls", _count);
2135   }
2136 #endif
2137 
2138   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2139     MacroAssembler _masm(&cbuf);
2140     __ nop(_count);
2141   }
2142 
2143   uint MachNopNode::size(PhaseRegAlloc*) const {
2144     return _count;
2145   }
2146 
2147 #ifndef PRODUCT
2148   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2149     st->print("# breakpoint");
2150   }
2151 #endif
2152 
2153   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2154     MacroAssembler _masm(&cbuf);
2155     __ int3();
2156   }
2157 
2158   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2159     return MachNode::size(ra_);
2160   }
2161 
2164 %}
2165 
2166 encode %{
2167 
2168   enc_class call_epilog %{
2169     if (VerifyStackAtCalls) {
2170       // Check that stack depth is unchanged: find the magic cookie on the stack
2171       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2172       MacroAssembler _masm(&cbuf);
2173       Label L;
2174       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2175       __ jccb(Assembler::equal, L);
2176       // Die if stack mismatch
2177       __ int3();
2178       __ bind(L);
2179     }
2180   %}
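
  // Note: the matching 0xbadb100d cookie is assumed to be written by the
  // method prolog when VerifyStackAtCalls is set, so hitting the int3 above
  // means the stack depth changed between method entry and this call site.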
2181 
2182 %}
2183 
2184 
2185 //----------OPERANDS-----------------------------------------------------------
2186 // Operand definitions must precede instruction definitions for correct parsing
2187 // in the ADLC because operands constitute user-defined types which are used in
2188 // instruction definitions.
2189 
2190 operand immU1() %{
2191   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(1));
2192   match(ConI);
2193 
2194   op_cost(0);
2195   format %{ %}
2196   interface(CONST_INTER);
2197 %}
2198 
2199 operand immU2() %{
2200   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(2));
2201   match(ConI);
2202 
2203   op_cost(0);
2204   format %{ %}
2205   interface(CONST_INTER);
2206 %}
2207 
2208 operand immU3() %{
2209   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(3));
2210   match(ConI);
2211 
2212   op_cost(0);
2213   format %{ %}
2214   interface(CONST_INTER);
2215 %}
2216 
2217 operand immU4() %{
2218   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(4));
2219   match(ConI);
2220 
2221   op_cost(0);
2222   format %{ %}
2223   interface(CONST_INTER);
2224 %}
2225 
2226 operand immU5() %{
2227   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(5));
2228   match(ConI);
2229 
2230   op_cost(0);
2231   format %{ %}
2232   interface(CONST_INTER);
2233 %}
2234 
2235 operand immU6() %{
2236   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(6));
2237   match(ConI);
2238 
2239   op_cost(0);
2240   format %{ %}
2241   interface(CONST_INTER);
2242 %}
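
// For reference: each immU<N> operand above matches a ConI in [0, nth_bit(N)),
// i.e. an unsigned N-bit immediate; for example, immU1 accepts 0..1 and
// immU5 accepts 0..31.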
2243 
2244 // Comparison Code for FP conditional move
2245 operand cmpOp_vcmppd() %{
2246   match(Bool);
2247 
2248   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2249             n->as_Bool()->_test._test != BoolTest::no_overflow);
2250   format %{ "" %}
2251   interface(COND_INTER) %{
2252     equal        (0x0, "eq");
2253     less         (0x1, "lt");
2254     less_equal   (0x2, "le");
2255     not_equal    (0xC, "ne");
2256     greater_equal(0xD, "ge");
2257     greater      (0xE, "gt");
2258     // TODO: adlc cannot compile this operand without the next two lines; it fails with:
2259     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2260     // equal' for overflow.
2261     overflow     (0x20, "o");  // not really supported by the instruction
2262     no_overflow  (0x21, "no"); // not really supported by the instruction
2263   %}
2264 %}
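
// The encodings above correspond to the vcmppd immediate predicates in the
// Intel SDM (0x0 EQ_OQ, 0x1 LT_OS, 0x2 LE_OS, 0xC NEQ_OQ, 0xD GE_OS,
// 0xE GT_OS); the overflow/no_overflow entries exist only to keep adlc happy.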
2265 
2266 
2267 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2268 
2269 // ============================================================================
2270 
2271 instruct ShouldNotReachHere() %{
2272   match(Halt);
2273   format %{ "ud2\t# ShouldNotReachHere" %}
2274   ins_encode %{
2275     __ ud2();
2276   %}
2277   ins_pipe(pipe_slow);
2278 %}
2279 
2280 // =================================EVEX special===============================
2281 
2282 instruct setMask(rRegI dst, rRegI src) %{
2283   predicate(Matcher::has_predicated_vectors());
2284   match(Set dst (SetVectMaskI  src));
2285   effect(TEMP dst);
2286   format %{ "setvectmask   $dst, $src" %}
2287   ins_encode %{
2288     __ setvectmask($dst$$Register, $src$$Register);
2289   %}
2290   ins_pipe(pipe_slow);
2291 %}
2292 
2293 // ============================================================================
2294 
2295 instruct addF_reg(regF dst, regF src) %{
2296   predicate((UseSSE>=1) && (UseAVX == 0));
2297   match(Set dst (AddF dst src));
2298 
2299   format %{ "addss   $dst, $src" %}
2300   ins_cost(150);
2301   ins_encode %{
2302     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2303   %}
2304   ins_pipe(pipe_slow);
2305 %}
2306 
2307 instruct addF_mem(regF dst, memory src) %{
2308   predicate((UseSSE>=1) && (UseAVX == 0));
2309   match(Set dst (AddF dst (LoadF src)));
2310 
2311   format %{ "addss   $dst, $src" %}
2312   ins_cost(150);
2313   ins_encode %{
2314     __ addss($dst$$XMMRegister, $src$$Address);
2315   %}
2316   ins_pipe(pipe_slow);
2317 %}
2318 
2319 instruct addF_imm(regF dst, immF con) %{
2320   predicate((UseSSE>=1) && (UseAVX == 0));
2321   match(Set dst (AddF dst con));
2322   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2323   ins_cost(150);
2324   ins_encode %{
2325     __ addss($dst$$XMMRegister, $constantaddress($con));
2326   %}
2327   ins_pipe(pipe_slow);
2328 %}
2329 
2330 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2331   predicate(UseAVX > 0);
2332   match(Set dst (AddF src1 src2));
2333 
2334   format %{ "vaddss  $dst, $src1, $src2" %}
2335   ins_cost(150);
2336   ins_encode %{
2337     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2338   %}
2339   ins_pipe(pipe_slow);
2340 %}
2341 
2342 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2343   predicate(UseAVX > 0);
2344   match(Set dst (AddF src1 (LoadF src2)));
2345 
2346   format %{ "vaddss  $dst, $src1, $src2" %}
2347   ins_cost(150);
2348   ins_encode %{
2349     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2350   %}
2351   ins_pipe(pipe_slow);
2352 %}
2353 
2354 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2355   predicate(UseAVX > 0);
2356   match(Set dst (AddF src con));
2357 
2358   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2359   ins_cost(150);
2360   ins_encode %{
2361     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2362   %}
2363   ins_pipe(pipe_slow);
2364 %}
2365 
2366 instruct addD_reg(regD dst, regD src) %{
2367   predicate((UseSSE>=2) && (UseAVX == 0));
2368   match(Set dst (AddD dst src));
2369 
2370   format %{ "addsd   $dst, $src" %}
2371   ins_cost(150);
2372   ins_encode %{
2373     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2374   %}
2375   ins_pipe(pipe_slow);
2376 %}
2377 
2378 instruct addD_mem(regD dst, memory src) %{
2379   predicate((UseSSE>=2) && (UseAVX == 0));
2380   match(Set dst (AddD dst (LoadD src)));
2381 
2382   format %{ "addsd   $dst, $src" %}
2383   ins_cost(150);
2384   ins_encode %{
2385     __ addsd($dst$$XMMRegister, $src$$Address);
2386   %}
2387   ins_pipe(pipe_slow);
2388 %}
2389 
2390 instruct addD_imm(regD dst, immD con) %{
2391   predicate((UseSSE>=2) && (UseAVX == 0));
2392   match(Set dst (AddD dst con));
2393   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2394   ins_cost(150);
2395   ins_encode %{
2396     __ addsd($dst$$XMMRegister, $constantaddress($con));
2397   %}
2398   ins_pipe(pipe_slow);
2399 %}
2400 
2401 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2402   predicate(UseAVX > 0);
2403   match(Set dst (AddD src1 src2));
2404 
2405   format %{ "vaddsd  $dst, $src1, $src2" %}
2406   ins_cost(150);
2407   ins_encode %{
2408     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2409   %}
2410   ins_pipe(pipe_slow);
2411 %}
2412 
2413 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2414   predicate(UseAVX > 0);
2415   match(Set dst (AddD src1 (LoadD src2)));
2416 
2417   format %{ "vaddsd  $dst, $src1, $src2" %}
2418   ins_cost(150);
2419   ins_encode %{
2420     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2421   %}
2422   ins_pipe(pipe_slow);
2423 %}
2424 
2425 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2426   predicate(UseAVX > 0);
2427   match(Set dst (AddD src con));
2428 
2429   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2430   ins_cost(150);
2431   ins_encode %{
2432     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2433   %}
2434   ins_pipe(pipe_slow);
2435 %}
2436 
2437 instruct subF_reg(regF dst, regF src) %{
2438   predicate((UseSSE>=1) && (UseAVX == 0));
2439   match(Set dst (SubF dst src));
2440 
2441   format %{ "subss   $dst, $src" %}
2442   ins_cost(150);
2443   ins_encode %{
2444     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2445   %}
2446   ins_pipe(pipe_slow);
2447 %}
2448 
2449 instruct subF_mem(regF dst, memory src) %{
2450   predicate((UseSSE>=1) && (UseAVX == 0));
2451   match(Set dst (SubF dst (LoadF src)));
2452 
2453   format %{ "subss   $dst, $src" %}
2454   ins_cost(150);
2455   ins_encode %{
2456     __ subss($dst$$XMMRegister, $src$$Address);
2457   %}
2458   ins_pipe(pipe_slow);
2459 %}
2460 
2461 instruct subF_imm(regF dst, immF con) %{
2462   predicate((UseSSE>=1) && (UseAVX == 0));
2463   match(Set dst (SubF dst con));
2464   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2465   ins_cost(150);
2466   ins_encode %{
2467     __ subss($dst$$XMMRegister, $constantaddress($con));
2468   %}
2469   ins_pipe(pipe_slow);
2470 %}
2471 
2472 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2473   predicate(UseAVX > 0);
2474   match(Set dst (SubF src1 src2));
2475 
2476   format %{ "vsubss  $dst, $src1, $src2" %}
2477   ins_cost(150);
2478   ins_encode %{
2479     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2480   %}
2481   ins_pipe(pipe_slow);
2482 %}
2483 
2484 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2485   predicate(UseAVX > 0);
2486   match(Set dst (SubF src1 (LoadF src2)));
2487 
2488   format %{ "vsubss  $dst, $src1, $src2" %}
2489   ins_cost(150);
2490   ins_encode %{
2491     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2492   %}
2493   ins_pipe(pipe_slow);
2494 %}
2495 
2496 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2497   predicate(UseAVX > 0);
2498   match(Set dst (SubF src con));
2499 
2500   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2501   ins_cost(150);
2502   ins_encode %{
2503     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2504   %}
2505   ins_pipe(pipe_slow);
2506 %}
2507 
2508 instruct subD_reg(regD dst, regD src) %{
2509   predicate((UseSSE>=2) && (UseAVX == 0));
2510   match(Set dst (SubD dst src));
2511 
2512   format %{ "subsd   $dst, $src" %}
2513   ins_cost(150);
2514   ins_encode %{
2515     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2516   %}
2517   ins_pipe(pipe_slow);
2518 %}
2519 
2520 instruct subD_mem(regD dst, memory src) %{
2521   predicate((UseSSE>=2) && (UseAVX == 0));
2522   match(Set dst (SubD dst (LoadD src)));
2523 
2524   format %{ "subsd   $dst, $src" %}
2525   ins_cost(150);
2526   ins_encode %{
2527     __ subsd($dst$$XMMRegister, $src$$Address);
2528   %}
2529   ins_pipe(pipe_slow);
2530 %}
2531 
2532 instruct subD_imm(regD dst, immD con) %{
2533   predicate((UseSSE>=2) && (UseAVX == 0));
2534   match(Set dst (SubD dst con));
2535   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2536   ins_cost(150);
2537   ins_encode %{
2538     __ subsd($dst$$XMMRegister, $constantaddress($con));
2539   %}
2540   ins_pipe(pipe_slow);
2541 %}
2542 
2543 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2544   predicate(UseAVX > 0);
2545   match(Set dst (SubD src1 src2));
2546 
2547   format %{ "vsubsd  $dst, $src1, $src2" %}
2548   ins_cost(150);
2549   ins_encode %{
2550     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2551   %}
2552   ins_pipe(pipe_slow);
2553 %}
2554 
2555 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2556   predicate(UseAVX > 0);
2557   match(Set dst (SubD src1 (LoadD src2)));
2558 
2559   format %{ "vsubsd  $dst, $src1, $src2" %}
2560   ins_cost(150);
2561   ins_encode %{
2562     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2563   %}
2564   ins_pipe(pipe_slow);
2565 %}
2566 
2567 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2568   predicate(UseAVX > 0);
2569   match(Set dst (SubD src con));
2570 
2571   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2572   ins_cost(150);
2573   ins_encode %{
2574     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2575   %}
2576   ins_pipe(pipe_slow);
2577 %}
2578 
2579 instruct mulF_reg(regF dst, regF src) %{
2580   predicate((UseSSE>=1) && (UseAVX == 0));
2581   match(Set dst (MulF dst src));
2582 
2583   format %{ "mulss   $dst, $src" %}
2584   ins_cost(150);
2585   ins_encode %{
2586     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2587   %}
2588   ins_pipe(pipe_slow);
2589 %}
2590 
2591 instruct mulF_mem(regF dst, memory src) %{
2592   predicate((UseSSE>=1) && (UseAVX == 0));
2593   match(Set dst (MulF dst (LoadF src)));
2594 
2595   format %{ "mulss   $dst, $src" %}
2596   ins_cost(150);
2597   ins_encode %{
2598     __ mulss($dst$$XMMRegister, $src$$Address);
2599   %}
2600   ins_pipe(pipe_slow);
2601 %}
2602 
2603 instruct mulF_imm(regF dst, immF con) %{
2604   predicate((UseSSE>=1) && (UseAVX == 0));
2605   match(Set dst (MulF dst con));
2606   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2607   ins_cost(150);
2608   ins_encode %{
2609     __ mulss($dst$$XMMRegister, $constantaddress($con));
2610   %}
2611   ins_pipe(pipe_slow);
2612 %}
2613 
2614 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2615   predicate(UseAVX > 0);
2616   match(Set dst (MulF src1 src2));
2617 
2618   format %{ "vmulss  $dst, $src1, $src2" %}
2619   ins_cost(150);
2620   ins_encode %{
2621     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2622   %}
2623   ins_pipe(pipe_slow);
2624 %}
2625 
2626 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2627   predicate(UseAVX > 0);
2628   match(Set dst (MulF src1 (LoadF src2)));
2629 
2630   format %{ "vmulss  $dst, $src1, $src2" %}
2631   ins_cost(150);
2632   ins_encode %{
2633     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2634   %}
2635   ins_pipe(pipe_slow);
2636 %}
2637 
2638 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2639   predicate(UseAVX > 0);
2640   match(Set dst (MulF src con));
2641 
2642   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2643   ins_cost(150);
2644   ins_encode %{
2645     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2646   %}
2647   ins_pipe(pipe_slow);
2648 %}
2649 
2650 instruct mulD_reg(regD dst, regD src) %{
2651   predicate((UseSSE>=2) && (UseAVX == 0));
2652   match(Set dst (MulD dst src));
2653 
2654   format %{ "mulsd   $dst, $src" %}
2655   ins_cost(150);
2656   ins_encode %{
2657     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2658   %}
2659   ins_pipe(pipe_slow);
2660 %}
2661 
2662 instruct mulD_mem(regD dst, memory src) %{
2663   predicate((UseSSE>=2) && (UseAVX == 0));
2664   match(Set dst (MulD dst (LoadD src)));
2665 
2666   format %{ "mulsd   $dst, $src" %}
2667   ins_cost(150);
2668   ins_encode %{
2669     __ mulsd($dst$$XMMRegister, $src$$Address);
2670   %}
2671   ins_pipe(pipe_slow);
2672 %}
2673 
2674 instruct mulD_imm(regD dst, immD con) %{
2675   predicate((UseSSE>=2) && (UseAVX == 0));
2676   match(Set dst (MulD dst con));
2677   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2678   ins_cost(150);
2679   ins_encode %{
2680     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2681   %}
2682   ins_pipe(pipe_slow);
2683 %}
2684 
2685 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2686   predicate(UseAVX > 0);
2687   match(Set dst (MulD src1 src2));
2688 
2689   format %{ "vmulsd  $dst, $src1, $src2" %}
2690   ins_cost(150);
2691   ins_encode %{
2692     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2693   %}
2694   ins_pipe(pipe_slow);
2695 %}
2696 
2697 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2698   predicate(UseAVX > 0);
2699   match(Set dst (MulD src1 (LoadD src2)));
2700 
2701   format %{ "vmulsd  $dst, $src1, $src2" %}
2702   ins_cost(150);
2703   ins_encode %{
2704     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2705   %}
2706   ins_pipe(pipe_slow);
2707 %}
2708 
2709 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2710   predicate(UseAVX > 0);
2711   match(Set dst (MulD src con));
2712 
2713   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2714   ins_cost(150);
2715   ins_encode %{
2716     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2717   %}
2718   ins_pipe(pipe_slow);
2719 %}
2720 
2721 instruct divF_reg(regF dst, regF src) %{
2722   predicate((UseSSE>=1) && (UseAVX == 0));
2723   match(Set dst (DivF dst src));
2724 
2725   format %{ "divss   $dst, $src" %}
2726   ins_cost(150);
2727   ins_encode %{
2728     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2729   %}
2730   ins_pipe(pipe_slow);
2731 %}
2732 
2733 instruct divF_mem(regF dst, memory src) %{
2734   predicate((UseSSE>=1) && (UseAVX == 0));
2735   match(Set dst (DivF dst (LoadF src)));
2736 
2737   format %{ "divss   $dst, $src" %}
2738   ins_cost(150);
2739   ins_encode %{
2740     __ divss($dst$$XMMRegister, $src$$Address);
2741   %}
2742   ins_pipe(pipe_slow);
2743 %}
2744 
2745 instruct divF_imm(regF dst, immF con) %{
2746   predicate((UseSSE>=1) && (UseAVX == 0));
2747   match(Set dst (DivF dst con));
2748   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2749   ins_cost(150);
2750   ins_encode %{
2751     __ divss($dst$$XMMRegister, $constantaddress($con));
2752   %}
2753   ins_pipe(pipe_slow);
2754 %}
2755 
2756 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2757   predicate(UseAVX > 0);
2758   match(Set dst (DivF src1 src2));
2759 
2760   format %{ "vdivss  $dst, $src1, $src2" %}
2761   ins_cost(150);
2762   ins_encode %{
2763     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2764   %}
2765   ins_pipe(pipe_slow);
2766 %}
2767 
2768 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2769   predicate(UseAVX > 0);
2770   match(Set dst (DivF src1 (LoadF src2)));
2771 
2772   format %{ "vdivss  $dst, $src1, $src2" %}
2773   ins_cost(150);
2774   ins_encode %{
2775     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2776   %}
2777   ins_pipe(pipe_slow);
2778 %}
2779 
2780 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2781   predicate(UseAVX > 0);
2782   match(Set dst (DivF src con));
2783 
2784   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2785   ins_cost(150);
2786   ins_encode %{
2787     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2788   %}
2789   ins_pipe(pipe_slow);
2790 %}
2791 
2792 instruct divD_reg(regD dst, regD src) %{
2793   predicate((UseSSE>=2) && (UseAVX == 0));
2794   match(Set dst (DivD dst src));
2795 
2796   format %{ "divsd   $dst, $src" %}
2797   ins_cost(150);
2798   ins_encode %{
2799     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2800   %}
2801   ins_pipe(pipe_slow);
2802 %}
2803 
2804 instruct divD_mem(regD dst, memory src) %{
2805   predicate((UseSSE>=2) && (UseAVX == 0));
2806   match(Set dst (DivD dst (LoadD src)));
2807 
2808   format %{ "divsd   $dst, $src" %}
2809   ins_cost(150);
2810   ins_encode %{
2811     __ divsd($dst$$XMMRegister, $src$$Address);
2812   %}
2813   ins_pipe(pipe_slow);
2814 %}
2815 
2816 instruct divD_imm(regD dst, immD con) %{
2817   predicate((UseSSE>=2) && (UseAVX == 0));
2818   match(Set dst (DivD dst con));
2819   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2820   ins_cost(150);
2821   ins_encode %{
2822     __ divsd($dst$$XMMRegister, $constantaddress($con));
2823   %}
2824   ins_pipe(pipe_slow);
2825 %}
2826 
2827 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2828   predicate(UseAVX > 0);
2829   match(Set dst (DivD src1 src2));
2830 
2831   format %{ "vdivsd  $dst, $src1, $src2" %}
2832   ins_cost(150);
2833   ins_encode %{
2834     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2835   %}
2836   ins_pipe(pipe_slow);
2837 %}
2838 
2839 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2840   predicate(UseAVX > 0);
2841   match(Set dst (DivD src1 (LoadD src2)));
2842 
2843   format %{ "vdivsd  $dst, $src1, $src2" %}
2844   ins_cost(150);
2845   ins_encode %{
2846     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2847   %}
2848   ins_pipe(pipe_slow);
2849 %}
2850 
2851 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2852   predicate(UseAVX > 0);
2853   match(Set dst (DivD src con));
2854 
2855   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2856   ins_cost(150);
2857   ins_encode %{
2858     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2859   %}
2860   ins_pipe(pipe_slow);
2861 %}
2862 
2863 instruct absF_reg(regF dst) %{
2864   predicate((UseSSE>=1) && (UseAVX == 0));
2865   match(Set dst (AbsF dst));
2866   ins_cost(150);
2867   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2868   ins_encode %{
2869     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2870   %}
2871   ins_pipe(pipe_slow);
2872 %}
2873 
2874 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2875   predicate(UseAVX > 0);
2876   match(Set dst (AbsF src));
2877   ins_cost(150);
2878   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2879   ins_encode %{
2880     int vector_len = 0;
2881     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2882               ExternalAddress(float_signmask()), vector_len);
2883   %}
2884   ins_pipe(pipe_slow);
2885 %}
2886 
2887 instruct absD_reg(regD dst) %{
2888   predicate((UseSSE>=2) && (UseAVX == 0));
2889   match(Set dst (AbsD dst));
2890   ins_cost(150);
2891   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2892             "# abs double by sign masking" %}
2893   ins_encode %{
2894     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2895   %}
2896   ins_pipe(pipe_slow);
2897 %}
2898 
2899 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2900   predicate(UseAVX > 0);
2901   match(Set dst (AbsD src));
2902   ins_cost(150);
2903   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2904             "# abs double by sign masking" %}
2905   ins_encode %{
2906     int vector_len = 0;
2907     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2908               ExternalAddress(double_signmask()), vector_len);
2909   %}
2910   ins_pipe(pipe_slow);
2911 %}
2912 
2913 instruct negI_rReg_2(rRegI dst, rFlagsReg cr)
2914 %{
2915   match(Set dst (NegI dst));
2916   effect(KILL cr);
2917 
2918   format %{ "negl    $dst\t# int" %}
2919   ins_encode %{
2920     __ negl($dst$$Register);
2921   %}
2922   ins_pipe(ialu_reg);
2923 %}
2924 
2925 instruct negL_rReg_2(rRegL dst, rFlagsReg cr)
2926 %{
2927   match(Set dst (NegL dst));
2928   effect(KILL cr);
2929 
2930   format %{ "negq    $dst\t# long" %}
2931   ins_encode %{
2932     __ negq($dst$$Register);
2933   %}
2934   ins_pipe(ialu_reg);
2935 %}
2936 
2937 instruct negF_reg(regF dst) %{
2938   predicate((UseSSE>=1) && (UseAVX == 0));
2939   match(Set dst (NegF dst));
2940   ins_cost(150);
2941   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2942   ins_encode %{
2943     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2944   %}
2945   ins_pipe(pipe_slow);
2946 %}
2947 
2948 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2949   predicate(UseAVX > 0);
2950   match(Set dst (NegF src));
2951   ins_cost(150);
2952   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2953   ins_encode %{
2954     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2955                  ExternalAddress(float_signflip()));
2956   %}
2957   ins_pipe(pipe_slow);
2958 %}
2959 
2960 instruct negD_reg(regD dst) %{
2961   predicate((UseSSE>=2) && (UseAVX == 0));
2962   match(Set dst (NegD dst));
2963   ins_cost(150);
2964   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2965             "# neg double by sign flipping" %}
2966   ins_encode %{
2967     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2968   %}
2969   ins_pipe(pipe_slow);
2970 %}
2971 
2972 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2973   predicate(UseAVX > 0);
2974   match(Set dst (NegD src));
2975   ins_cost(150);
2976   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2977             "# neg double by sign flipping" %}
2978   ins_encode %{
2979     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2980                  ExternalAddress(double_signflip()));
2981   %}
2982   ins_pipe(pipe_slow);
2983 %}
2984 
2985 instruct sqrtF_reg(regF dst, regF src) %{
2986   predicate(UseSSE>=1);
2987   match(Set dst (SqrtF src));
2988 
2989   format %{ "sqrtss  $dst, $src" %}
2990   ins_cost(150);
2991   ins_encode %{
2992     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2993   %}
2994   ins_pipe(pipe_slow);
2995 %}
2996 
2997 instruct sqrtF_mem(regF dst, memory src) %{
2998   predicate(UseSSE>=1);
2999   match(Set dst (SqrtF (LoadF src)));
3000 
3001   format %{ "sqrtss  $dst, $src" %}
3002   ins_cost(150);
3003   ins_encode %{
3004     __ sqrtss($dst$$XMMRegister, $src$$Address);
3005   %}
3006   ins_pipe(pipe_slow);
3007 %}
3008 
3009 instruct sqrtF_imm(regF dst, immF con) %{
3010   predicate(UseSSE>=1);
3011   match(Set dst (SqrtF con));
3012 
3013   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3014   ins_cost(150);
3015   ins_encode %{
3016     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
3017   %}
3018   ins_pipe(pipe_slow);
3019 %}
3020 
3021 instruct sqrtD_reg(regD dst, regD src) %{
3022   predicate(UseSSE>=2);
3023   match(Set dst (SqrtD src));
3024 
3025   format %{ "sqrtsd  $dst, $src" %}
3026   ins_cost(150);
3027   ins_encode %{
3028     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
3029   %}
3030   ins_pipe(pipe_slow);
3031 %}
3032 
3033 instruct sqrtD_mem(regD dst, memory src) %{
3034   predicate(UseSSE>=2);
3035   match(Set dst (SqrtD (LoadD src)));
3036 
3037   format %{ "sqrtsd  $dst, $src" %}
3038   ins_cost(150);
3039   ins_encode %{
3040     __ sqrtsd($dst$$XMMRegister, $src$$Address);
3041   %}
3042   ins_pipe(pipe_slow);
3043 %}
3044 
3045 instruct sqrtD_imm(regD dst, immD con) %{
3046   predicate(UseSSE>=2);
3047   match(Set dst (SqrtD con));
3048   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3049   ins_cost(150);
3050   ins_encode %{
3051     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
3052   %}
3053   ins_pipe(pipe_slow);
3054 %}
3055 
3056 instruct onspinwait() %{
3057   match(OnSpinWait);
3058   ins_cost(200);
3059 
3060   format %{
3061     $$template
3062     $$emit$$"pause\t! membar_onspinwait"
3063   %}
3064   ins_encode %{
3065     __ pause();
3066   %}
3067   ins_pipe(pipe_slow);
3068 %}
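
// Note: pause hints to the processor that this is a spin-wait loop, avoiding
// the memory-order-violation pipeline flush on loop exit and reducing power;
// on processors without the hint it simply executes as a nop.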
3069 
3070 // a * b + c
3071 instruct fmaD_reg(regD a, regD b, regD c) %{
3072   predicate(UseFMA);
3073   match(Set c (FmaD  c (Binary a b)));
3074   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3075   ins_cost(150);
3076   ins_encode %{
3077     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3078   %}
3079   ins_pipe( pipe_slow );
3080 %}
3081 
3082 // a * b + c
3083 instruct fmaF_reg(regF a, regF b, regF c) %{
3084   predicate(UseFMA);
3085   match(Set c (FmaF  c (Binary a b)));
3086   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3087   ins_cost(150);
3088   ins_encode %{
3089     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3090   %}
3091   ins_pipe( pipe_slow );
3092 %}
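
// Note: the fused forms above compute a*b+c with a single rounding step, as
// required by Math.fma, which is why they are guarded by UseFMA instead of
// being matched from separate Mul and Add nodes.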
3093 
3094 // ====================VECTOR INSTRUCTIONS=====================================
3095 
3096 instruct reinterpretS(vecS dst) %{
3097   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3098   match(Set dst (VectorReinterpret dst));
3099   ins_cost(125);
3100   format %{ " # reinterpret $dst" %}
3101   ins_encode %{
3102     // empty
3103   %}
3104   ins_pipe( pipe_slow );
3105 %}
3106 
3107 instruct reinterpretS2D(vecD dst, vecS src, rRegL scratch) %{
3108   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3109   match(Set dst (VectorReinterpret src));
3110   ins_cost(125);
3111   effect(TEMP dst, TEMP scratch);
3112   format %{ " # reinterpret $dst,$src" %}
3113   ins_encode %{
3114     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3115     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3116   %}
3117   ins_pipe( pipe_slow );
3118 %}
3119 
3120 instruct reinterpretS2D_avx(vecD dst, vecS src, rRegL scratch) %{
3121   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3122   match(Set dst (VectorReinterpret src));
3123   ins_cost(125);
3124   effect(TEMP dst, TEMP scratch);
3125   format %{ " # reinterpret $dst,$src" %}
3126   ins_encode %{
3127     int vector_len = 0;
3128     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3129   %}
3130   ins_pipe( pipe_slow );
3131 %}
3132 
3133 instruct reinterpretS2X(vecX dst, vecS src, rRegL scratch) %{
3134   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3135   match(Set dst (VectorReinterpret src));
3136   ins_cost(125);
3137   effect(TEMP dst, TEMP scratch);
3138   format %{ " # reinterpret $dst,$src" %}
3139   ins_encode %{
3140     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3141     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3142   %}
3143   ins_pipe( pipe_slow );
3144 %}
3145 
3146 instruct reinterpretS2X_avx(vecX dst, vecS src, rRegL scratch) %{
3147   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3148   match(Set dst (VectorReinterpret src));
3149   ins_cost(125);
3150   effect(TEMP scratch);
3151   format %{ " # reinterpret $dst,$src" %}
3152   ins_encode %{
3153     int vector_len = 0;
3154     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3155   %}
3156   ins_pipe( pipe_slow );
3157 %}
3158 
3159 instruct reinterpretS2Y(vecY dst, vecS src, rRegL scratch) %{
3160   predicate(UseAVX >= 2 && n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3161   match(Set dst (VectorReinterpret src));
3162   ins_cost(125);
3163   effect(TEMP scratch);
3164   format %{ " # reinterpret $dst,$src" %}
3165   ins_encode %{
3166     int vector_len = 1;
3167     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3168   %}
3169   ins_pipe( pipe_slow );
3170 %}
3171 
3172 instruct reinterpretS2Z(vecZ dst, vecS src, rRegL scratch) %{
3173   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3174   match(Set dst (VectorReinterpret src));
3175   ins_cost(125);
3176   effect(TEMP scratch);
3177   format %{ " # reinterpret $dst,$src" %}
3178   ins_encode %{
3179     int vector_len = 2;
3180     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3181   %}
3182   ins_pipe( pipe_slow );
3183 %}
3184 
3185 instruct reinterpretD2S(vecS dst, vecD src) %{
3186   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3187   match(Set dst (VectorReinterpret src));
3188   ins_cost(125);
3189   format %{ " # reinterpret $dst,$src" %}
3190   ins_encode %{
3191     // If the registers are the same, no move is needed.
3192     if ($dst$$XMMRegister != $src$$XMMRegister) {
3193       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3194     }
3195   %}
3196   ins_pipe( pipe_slow );
3197 %}
3198 
3199 instruct reinterpretD(vecD dst) %{
3200   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3201   match(Set dst (VectorReinterpret dst));
3202   ins_cost(125);
3203   format %{ " # reinterpret $dst" %}
3204   ins_encode %{
3205     // empty
3206   %}
3207   ins_pipe( pipe_slow );
3208 %}
3209 
3210 instruct reinterpretD2X(vecX dst, vecD src, rRegL scratch) %{
3211   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3212   match(Set dst (VectorReinterpret src));
3213   ins_cost(125);
3214   effect(TEMP dst, TEMP scratch);
3215   format %{ " # reinterpret $dst,$src" %}
3216   ins_encode %{
3217     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3218     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3219   %}
3220   ins_pipe( pipe_slow );
3221 %}
3222 
3223 instruct reinterpretD2X_avx(vecX dst, vecD src, rRegL scratch) %{
3224   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3225   match(Set dst (VectorReinterpret src));
3226   ins_cost(125);
3227   effect(TEMP dst, TEMP scratch);
3228   format %{ " # reinterpret $dst,$src" %}
3229   ins_encode %{
3230     int vector_len = 0;
3231     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3232   %}
3233   ins_pipe( pipe_slow );
3234 %}
3235 
3236 instruct reinterpretD2Y(vecY dst, vecD src, rRegL scratch) %{
3237   predicate(UseAVX >= 2 && n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3238   match(Set dst (VectorReinterpret src));
3239   ins_cost(125);
3240   effect(TEMP scratch);
3241   format %{ " # reinterpret $dst,$src" %}
3242   ins_encode %{
3243     int vector_len = 1;
3244     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3245   %}
3246   ins_pipe( pipe_slow );
3247 %}
3248 
3249 instruct reinterpretD2Z(vecZ dst, vecD src, rRegL scratch) %{
3250   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3251   match(Set dst (VectorReinterpret src));
3252   ins_cost(125);
3253   effect(TEMP scratch);
3254   format %{ " # reinterpret $dst,$src" %}
3255   ins_encode %{
3256     int vector_len = 2;
3257     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3258   %}
3259   ins_pipe( pipe_slow );
3260 %}
3261 
3262 instruct reinterpretX2S(vecS dst, vecX src) %{
3263   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3264   match(Set dst (VectorReinterpret src));
3265   ins_cost(125);
3266   format %{ " # reinterpret $dst,$src" %}
3267   ins_encode %{
3268     // If the registers are the same, no move is needed.
3269     if ($dst$$XMMRegister != $src$$XMMRegister) {
3270       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3271     }
3272   %}
3273   ins_pipe( pipe_slow );
3274 %}
3275 
3276 instruct reinterpretX2D(vecD dst, vecX src) %{
3277   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3278   match(Set dst (VectorReinterpret src));
3279   ins_cost(125);
3280   format %{ " # reinterpret $dst,$src" %}
3281   ins_encode %{
3282     // If the registers are the same, no move is needed.
3283     if ($dst$$XMMRegister != $src$$XMMRegister) {
3284       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3285     }
3286   %}
3287   ins_pipe( pipe_slow );
3288 %}
3289 
3290 instruct reinterpretX(vecX dst) %{
3291   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3292   match(Set dst (VectorReinterpret dst));
3293   ins_cost(125);
3294   format %{ " # reinterpret $dst" %}
3295   ins_encode %{
3296     // empty
3297   %}
3298   ins_pipe( pipe_slow );
3299 %}
3300 
3301 instruct reinterpretX2Y(vecY dst, vecX src) %{
3302   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3303   match(Set dst (VectorReinterpret src));
3304   ins_cost(125);
3305   effect(TEMP dst);
3306   format %{ " # reinterpret $dst,$src" %}
3307   ins_encode %{
3308     int vector_len = 1;
3309     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3310     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);  // only the low 128 bits need to be moved
3311   %}
3312   ins_pipe( pipe_slow );
3313 %}
3314 
3315 instruct reinterpretX2Z(vecZ dst, vecX src) %{
3316   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3317   match(Set dst (VectorReinterpret src));
3318   ins_cost(125);
3319   effect(TEMP dst);
3320   format %{ " # reinterpret $dst,$src" %}
3321   ins_encode %{
3322     int vector_len = 2;
3323     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3324     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);  // only the low 128 bits need to be moved
3325   %}
3326   ins_pipe( pipe_slow );
3327 %}
3328 
3329 instruct reinterpretY2S(vecS dst, vecY src) %{
3330   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3331   match(Set dst (VectorReinterpret src));
3332   ins_cost(125);
3333   format %{ " # reinterpret $dst,$src" %}
3334   ins_encode %{
3335     // If the registers are the same, no move is needed.
3336     if ($dst$$XMMRegister != $src$$XMMRegister) {
3337       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3338     }
3339   %}
3340   ins_pipe( pipe_slow );
3341 %}
3342 
3343 instruct reinterpretY2D(vecD dst, vecY src) %{
3344   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3345   match(Set dst (VectorReinterpret src));
3346   ins_cost(125);
3347   format %{ " # reinterpret $dst,$src" %}
3348   ins_encode %{
3349     // If the registers are the same, no move is needed.
3350     if ($dst$$XMMRegister != $src$$XMMRegister) {
3351       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3352     }
3353   %}
3354   ins_pipe( pipe_slow );
3355 %}
3356 
3357 instruct reinterpretY2X(vecX dst, vecY src) %{
3358   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3359   match(Set dst (VectorReinterpret src));
3360   ins_cost(125);
3361   format %{ " # reinterpret $dst,$src" %}
3362   ins_encode %{
3363     // If the registers are the same, no move is needed.
3364     if ($dst$$XMMRegister != $src$$XMMRegister) {
3365       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3366     }
3367   %}
3368   ins_pipe( pipe_slow );
3369 %}
3370 
3371 instruct reinterpretY(vecY dst) %{
3372   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3373   match(Set dst (VectorReinterpret dst));
3374   ins_cost(125);
3375   format %{ " # reinterpret $dst" %}
3376   ins_encode %{
3377     // empty
3378   %}
3379   ins_pipe( pipe_slow );
3380 %}
3381 
3382 instruct reinterpretY2Z(vecZ dst, vecY src) %{
3383   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3384   match(Set dst (VectorReinterpret src));
3385   ins_cost(125);
3386   effect(TEMP dst);
3387   format %{ " # reinterpret $dst,$src" %}
3388   ins_encode %{
3389     int vector_len = 2;
3390     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3391     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3392   %}
3393   ins_pipe( pipe_slow );
3394 %}
3395 
3396 instruct reinterpretZ2S(vecS dst, vecZ src) %{
3397   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3398   match(Set dst (VectorReinterpret src));
3399   ins_cost(125);
3400   format %{ " # reinterpret $dst,$src" %}
3401   ins_encode %{
3402     // If the registers are the same, no move is needed.
3403     if ($dst$$XMMRegister != $src$$XMMRegister) {
3404       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3405     }
3406   %}
3407   ins_pipe( pipe_slow );
3408 %}
3409 
3410 instruct reinterpretZ2D(vecD dst, vecZ src) %{
3411   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3412   match(Set dst (VectorReinterpret src));
3413   ins_cost(125);
3414   format %{ " # reinterpret $dst,$src" %}
3415   ins_encode %{
3416     // If the registers are the same, no move is needed.
3417     if ($dst$$XMMRegister != $src$$XMMRegister) {
3418       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3419     }
3420   %}
3421   ins_pipe( pipe_slow );
3422 %}
3423 
3424 instruct reinterpretZ2X(vecX dst, vecZ src) %{
3425   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3426   match(Set dst (VectorReinterpret src));
3427   ins_cost(125);
3428   format %{ " # reinterpret $dst,$src" %}
3429   ins_encode %{
3430     // If the registers are the same, no move is needed.
3431     if ($dst$$XMMRegister != $src$$XMMRegister) {
3432       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3433     }
3434   %}
3435   ins_pipe( pipe_slow );
3436 %}
3437 
3438 instruct reinterpretZ2Y(vecY dst, vecZ src) %{
3439   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3440   match(Set dst (VectorReinterpret src));
3441   ins_cost(125);
3442   format %{ " # reinterpret $dst,$src" %}
3443   ins_encode %{
3444     // If the registers are the same, no move is needed.
3445     if ($dst$$XMMRegister != $src$$XMMRegister) {
3446       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3447     }
3448   %}
3449   ins_pipe( pipe_slow );
3450 %}
3451 
3452 instruct reinterpretZ(vecZ dst) %{
3453   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3454   match(Set dst (VectorReinterpret dst));
3455   ins_cost(125);
3456   format %{ " # reinterpret $dst" %}
3457   ins_encode %{
3458     // empty
3459   %}
3460   ins_pipe( pipe_slow );
3461 %}
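
// Note on the reinterpret forms above: widening reinterprets must zero the
// upper bits of the destination, either by and-ing with vector_32/64_bit_mask
// or by clearing the register with vpxor first; same-size and narrowing
// reinterprets need at most a register-to-register move.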
3462 
3463 // ==========
3464 
3465 // Load vectors (1 byte long)
3466 instruct loadV1(vecS dst, memory mem, rRegI tmp) %{
3467   predicate(n->as_LoadVector()->memory_size() == 1);
3468   match(Set dst (LoadVector mem));
3469   ins_cost(125);
3470   effect(TEMP tmp);
3471   format %{ "movzbl $tmp,$mem\n\t"
3472           "movd $dst,$tmp\t! load vector (1 byte)" %}
3473   ins_encode %{
3474     __ movzbl($tmp$$Register, $mem$$Address);
3475     __ movdl($dst$$XMMRegister, $tmp$$Register);
3476   %}
3477   ins_pipe( pipe_slow );
3478 %}
3479 
3480 // Load vectors (2 bytes long)
3481 instruct loadV2(vecS dst, memory mem, rRegI tmp) %{
3482   predicate(n->as_LoadVector()->memory_size() == 2);
3483   match(Set dst (LoadVector mem));
3484   ins_cost(125);
3485   effect(TEMP tmp);
3486   format %{ "movzwl $tmp,$mem\n\t"
3487           "movd $dst,$tmp\t! load vector (2 bytes)" %}
3488   ins_encode %{
3489     __ movzwl($tmp$$Register, $mem$$Address);
3490     __ movdl($dst$$XMMRegister, $tmp$$Register);
3491   %}
3492   ins_pipe( pipe_slow );
3493 %}
3494 
3495 
3496 // Load vectors (4 bytes long)
3497 instruct loadV4(vecS dst, memory mem) %{
3498   predicate(n->as_LoadVector()->memory_size() == 4);
3499   match(Set dst (LoadVector mem));
3500   ins_cost(125);
3501   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
3502   ins_encode %{
3503     __ movdl($dst$$XMMRegister, $mem$$Address);
3504   %}
3505   ins_pipe( pipe_slow );
3506 %}
3507 
3508 // Move vectors (4 bytes long)
3509 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
3510   match(Set dst src);
3511   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
3512   ins_encode %{
3513     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
3514   %}
3515   ins_pipe( fpu_reg_reg );
3516 %}
3517 
3518 // Move vectors (4 bytes long)
3519 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
3520   match(Set dst src);
3521   format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
3522   ins_encode %{
3523     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
3524   %}
3525   ins_pipe( fpu_reg_reg );
3526 %}
3527 
3528 // Load vectors (8 bytes long)
3529 instruct loadV8(vecD dst, memory mem) %{
3530   predicate(n->as_LoadVector()->memory_size() == 8);
3531   match(Set dst (LoadVector mem));
3532   ins_cost(125);
3533   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
3534   ins_encode %{
3535     __ movq($dst$$XMMRegister, $mem$$Address);
3536   %}
3537   ins_pipe( pipe_slow );
3538 %}
3539 
3540 // Move vectors (8 bytes long)
3541 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
3542   match(Set dst src);
3543   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3544   ins_encode %{
3545     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3546   %}
3547   ins_pipe( fpu_reg_reg );
3548 %}
3549 
3550 // Move vectors (8 bytes long)
3551 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
3552   match(Set dst src);
3553   format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3554   ins_encode %{
3555     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3556   %}
3557   ins_pipe( fpu_reg_reg );
3558 %}
3559 
3560 // Load vectors (16 bytes long)
3561 instruct loadV16(vecX dst, memory mem) %{
3562   predicate(n->as_LoadVector()->memory_size() == 16);
3563   match(Set dst (LoadVector mem));
3564   ins_cost(125);
3565   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3566   ins_encode %{
3567     __ movdqu($dst$$XMMRegister, $mem$$Address);
3568   %}
3569   ins_pipe( pipe_slow );
3570 %}
3571 
3572 // Move vectors (16 bytes long)
3573 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
3574   match(Set dst src);
3575   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3576   ins_encode %{
3577     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3578       int vector_len = 2;
3579       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3580     } else {
3581       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3582     }
3583   %}
3584   ins_pipe( fpu_reg_reg );
3585 %}
3586 
3587 // Move vectors (16 bytes long)
3588 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
3589   match(Set dst src);
3590   format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3591   ins_encode %{
3592     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3593       int vector_len = 2;
3594       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3595     } else {
3596       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3597     }
3598   %}
3599   ins_pipe( fpu_reg_reg );
3600 %}
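
// Note: when AVX-512 is available without AVX512VL, the EVEX-only registers
// XMM16-31 can be reached only by 512-bit instructions, so the Move*2Leg and
// MoveLeg2Vec* forms fall back to a full-width evmovdquq in that case.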
3601 
3602 // Load vectors (32 bytes long)
3603 instruct loadV32(vecY dst, memory mem) %{
3604   predicate(n->as_LoadVector()->memory_size() == 32);
3605   match(Set dst (LoadVector mem));
3606   ins_cost(125);
3607   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3608   ins_encode %{
3609     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3610   %}
3611   ins_pipe( pipe_slow );
3612 %}
3613 
3614 // Move vectors (32 bytes long)
3615 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3616   match(Set dst src);
3617   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3618   ins_encode %{
3619     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3620       int vector_len = 2;
3621       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3622     } else {
3623       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3624     }
3625   %}
3626   ins_pipe( fpu_reg_reg );
3627 %}
3628 
3629 // Move vectors (32 bytes long)
3630 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3631   match(Set dst src);
3632   format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3633   ins_encode %{
3634     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3635       int vector_len = 2;
3636       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3637     } else {
3638       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3639     }
3640   %}
3641   ins_pipe( fpu_reg_reg );
3642 %}
3643 
3644 // Load vectors (64 bytes long)
3645 instruct loadV64_dword(vecZ dst, memory mem) %{
3646   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3647   match(Set dst (LoadVector mem));
3648   ins_cost(125);
3649   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3650   ins_encode %{
3651     int vector_len = 2;
3652     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3653   %}
3654   ins_pipe( pipe_slow );
3655 %}
3656 
3657 // Load vectors (64 bytes long)
3658 instruct loadV64_qword(vecZ dst, memory mem) %{
3659   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3660   match(Set dst (LoadVector mem));
3661   ins_cost(125);
3662   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3663   ins_encode %{
3664     int vector_len = 2;
3665     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3666   %}
3667   ins_pipe( pipe_slow );
3668 %}
3669 
3670 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3671   match(Set dst src);
3672   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3673   ins_encode %{
3674     int vector_len = 2;
3675     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3676   %}
3677   ins_pipe( fpu_reg_reg );
3678 %}
3679 
instruct MoveLeg2VecZ(vecZ dst, legVecZ src) %{
3681   match(Set dst src);
3682   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3683   ins_encode %{
3684     int vector_len = 2;
3685     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3686   %}
3687   ins_pipe( fpu_reg_reg );
3688 %}
3689 
3690 // Store vectors
3691 instruct storeV1(memory mem, vecS src, rRegI tmp) %{
3692   predicate(n->as_StoreVector()->memory_size() == 1);
3693   match(Set mem (StoreVector mem src));
3694   ins_cost(145);
3695   effect(TEMP tmp);
3696   format %{ "movd $tmp,$src\n\t"
3697           "movb $mem,$tmp\t! store vector (1 byte)" %}
3698   ins_encode %{
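    // Extract the byte to a GPR and store with movb; x86 has no 1-byte
    // store from an XMM register without SSE4.1 pextrb.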
3699     __ movdl($tmp$$Register, $src$$XMMRegister);
3700     __ movb($mem$$Address, $tmp$$Register);
3701   %}
3702   ins_pipe( pipe_slow );
3703 %}
3704 
3705 instruct storeV2(memory mem, vecS src, rRegI tmp) %{
3706   predicate(n->as_StoreVector()->memory_size() == 2);
3707   match(Set mem (StoreVector mem src));
3708   ins_cost(145);
3709   effect(TEMP tmp);
3710   format %{ "movd $tmp,$src\n\t"
3711           "movw $mem,$tmp\t! store vector (2 bytes)" %}
3712   ins_encode %{
3713     __ movdl($tmp$$Register, $src$$XMMRegister);
3714     __ movw($mem$$Address, $tmp$$Register);
3715   %}
3716   ins_pipe( pipe_slow );
3717 %}
3718 
3719 instruct storeV4(memory mem, vecS src) %{
3720   predicate(n->as_StoreVector()->memory_size() == 4);
3721   match(Set mem (StoreVector mem src));
3722   ins_cost(145);
3723   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3724   ins_encode %{
3725     __ movdl($mem$$Address, $src$$XMMRegister);
3726   %}
3727   ins_pipe( pipe_slow );
3728 %}
3729 
3730 instruct storeV8(memory mem, vecD src) %{
3731   predicate(n->as_StoreVector()->memory_size() == 8);
3732   match(Set mem (StoreVector mem src));
3733   ins_cost(145);
3734   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3735   ins_encode %{
3736     __ movq($mem$$Address, $src$$XMMRegister);
3737   %}
3738   ins_pipe( pipe_slow );
3739 %}
3740 
3741 instruct storeV16(memory mem, vecX src) %{
3742   predicate(n->as_StoreVector()->memory_size() == 16);
3743   match(Set mem (StoreVector mem src));
3744   ins_cost(145);
3745   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3746   ins_encode %{
3747     __ movdqu($mem$$Address, $src$$XMMRegister);
3748   %}
3749   ins_pipe( pipe_slow );
3750 %}
3751 
3752 instruct storeV32(memory mem, vecY src) %{
3753   predicate(n->as_StoreVector()->memory_size() == 32);
3754   match(Set mem (StoreVector mem src));
3755   ins_cost(145);
3756   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3757   ins_encode %{
3758     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3759   %}
3760   ins_pipe( pipe_slow );
3761 %}
3762 
3763 instruct storeV64_dword(memory mem, vecZ src) %{
3764   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3765   match(Set mem (StoreVector mem src));
3766   ins_cost(145);
3767   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3768   ins_encode %{
3769     int vector_len = 2;
3770     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3771   %}
3772   ins_pipe( pipe_slow );
3773 %}
3774 
3775 instruct storeV64_qword(memory mem, vecZ src) %{
3776   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3777   match(Set mem (StoreVector mem src));
3778   ins_cost(145);
3779   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3780   ins_encode %{
3781     int vector_len = 2;
3782     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3783   %}
3784   ins_pipe( pipe_slow );
3785 %}
3786 
3787 // ====================LEGACY REPLICATE=======================================
3788 
3789 instruct Repl4B_mem(vecS dst, memory mem) %{
3790   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3791   match(Set dst (ReplicateB (LoadB mem)));
3792   format %{ "punpcklbw $dst,$mem\n\t"
3793             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3794   ins_encode %{
3795     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3796     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3797   %}
3798   ins_pipe( pipe_slow );
3799 %}
3800 
3801 instruct Repl8B_mem(vecD dst, memory mem) %{
3802   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3803   match(Set dst (ReplicateB (LoadB mem)));
3804   format %{ "punpcklbw $dst,$mem\n\t"
3805             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3806   ins_encode %{
3807     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3808     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3809   %}
3810   ins_pipe( pipe_slow );
3811 %}
3812 
3813 instruct Repl16B(vecX dst, rRegI src) %{
3814   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3815   match(Set dst (ReplicateB src));
3816   format %{ "movd    $dst,$src\n\t"
3817             "punpcklbw $dst,$dst\n\t"
3818             "pshuflw $dst,$dst,0x00\n\t"
3819             "punpcklqdq $dst,$dst\t! replicate16B" %}
3820   ins_encode %{
3821     __ movdl($dst$$XMMRegister, $src$$Register);
3822     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3823     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3824     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3825   %}
3826   ins_pipe( pipe_slow );
3827 %}
3828 
3829 instruct Repl16B_mem(vecX dst, memory mem) %{
3830   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3831   match(Set dst (ReplicateB (LoadB mem)));
3832   format %{ "punpcklbw $dst,$mem\n\t"
3833             "pshuflw $dst,$dst,0x00\n\t"
3834             "punpcklqdq $dst,$dst\t! replicate16B" %}
3835   ins_encode %{
3836     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3837     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3838     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3839   %}
3840   ins_pipe( pipe_slow );
3841 %}
3842 
3843 instruct Repl32B(vecY dst, rRegI src) %{
3844   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3845   match(Set dst (ReplicateB src));
3846   format %{ "movd    $dst,$src\n\t"
3847             "punpcklbw $dst,$dst\n\t"
3848             "pshuflw $dst,$dst,0x00\n\t"
3849             "punpcklqdq $dst,$dst\n\t"
3850             "vinserti128_high $dst,$dst\t! replicate32B" %}
3851   ins_encode %{
3852     __ movdl($dst$$XMMRegister, $src$$Register);
3853     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3854     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3855     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3856     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3857   %}
3858   ins_pipe( pipe_slow );
3859 %}
3860 
3861 instruct Repl32B_mem(vecY dst, memory mem) %{
3862   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3863   match(Set dst (ReplicateB (LoadB mem)));
3864   format %{ "punpcklbw $dst,$mem\n\t"
3865             "pshuflw $dst,$dst,0x00\n\t"
3866             "punpcklqdq $dst,$dst\n\t"
3867             "vinserti128_high $dst,$dst\t! replicate32B" %}
3868   ins_encode %{
3869     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3870     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3871     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3872     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3873   %}
3874   ins_pipe( pipe_slow );
3875 %}
3876 
3877 instruct Repl64B(legVecZ dst, rRegI src) %{
3878   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3879   match(Set dst (ReplicateB src));
3880   format %{ "movd    $dst,$src\n\t"
3881             "punpcklbw $dst,$dst\n\t"
3882             "pshuflw $dst,$dst,0x00\n\t"
3883             "punpcklqdq $dst,$dst\n\t"
3884             "vinserti128_high $dst,$dst\t"
3885             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3886   ins_encode %{
3887     __ movdl($dst$$XMMRegister, $src$$Register);
3888     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3889     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3890     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3891     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3892     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3893   %}
3894   ins_pipe( pipe_slow );
3895 %}
3896 
3897 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3898   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3899   match(Set dst (ReplicateB (LoadB mem)));
3900   format %{ "punpcklbw $dst,$mem\n\t"
3901             "pshuflw $dst,$dst,0x00\n\t"
3902             "punpcklqdq $dst,$dst\n\t"
3903             "vinserti128_high $dst,$dst\t"
3904             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3905   ins_encode %{
3906     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3907     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3908     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3909     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3910     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3911   %}
3912   ins_pipe( pipe_slow );
3913 %}
3914 
3915 instruct Repl16B_imm(vecX dst, immI con) %{
3916   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3917   match(Set dst (ReplicateB con));
3918   format %{ "movq    $dst,[$constantaddress]\n\t"
3919             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3920   ins_encode %{
3921     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3922     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3923   %}
3924   ins_pipe( pipe_slow );
3925 %}
3926 
3927 instruct Repl32B_imm(vecY dst, immI con) %{
3928   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3929   match(Set dst (ReplicateB con));
3930   format %{ "movq    $dst,[$constantaddress]\n\t"
3931             "punpcklqdq $dst,$dst\n\t"
3932             "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
3933   ins_encode %{
3934     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3935     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3936     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3937   %}
3938   ins_pipe( pipe_slow );
3939 %}
3940 
3941 instruct Repl64B_imm(legVecZ dst, immI con) %{
3942   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3943   match(Set dst (ReplicateB con));
3944   format %{ "movq    $dst,[$constantaddress]\n\t"
3945             "punpcklqdq $dst,$dst\n\t"
3946             "vinserti128_high $dst,$dst\t"
3947             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3948   ins_encode %{
3949     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3950     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3951     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3952     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3953   %}
3954   ins_pipe( pipe_slow );
3955 %}
3956 
3957 instruct Repl4S(vecD dst, rRegI src) %{
3958   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3959   match(Set dst (ReplicateS src));
3960   format %{ "movd    $dst,$src\n\t"
3961             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3962   ins_encode %{
3963     __ movdl($dst$$XMMRegister, $src$$Register);
3964     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3965   %}
3966   ins_pipe( pipe_slow );
3967 %}
3968 
3969 instruct Repl4S_mem(vecD dst, memory mem) %{
3970   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3971   match(Set dst (ReplicateS (LoadS mem)));
3972   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3973   ins_encode %{
3974     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3975   %}
3976   ins_pipe( pipe_slow );
3977 %}
3978 
3979 instruct Repl8S(vecX dst, rRegI src) %{
3980   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3981   match(Set dst (ReplicateS src));
3982   format %{ "movd    $dst,$src\n\t"
3983             "pshuflw $dst,$dst,0x00\n\t"
3984             "punpcklqdq $dst,$dst\t! replicate8S" %}
3985   ins_encode %{
3986     __ movdl($dst$$XMMRegister, $src$$Register);
3987     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3988     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3989   %}
3990   ins_pipe( pipe_slow );
3991 %}
3992 
3993 instruct Repl8S_mem(vecX dst, memory mem) %{
3994   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3995   match(Set dst (ReplicateS (LoadS mem)));
3996   format %{ "pshuflw $dst,$mem,0x00\n\t"
3997             "punpcklqdq $dst,$dst\t! replicate8S" %}
3998   ins_encode %{
3999     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
4000     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4001   %}
4002   ins_pipe( pipe_slow );
4003 %}
4004 
4005 instruct Repl8S_imm(vecX dst, immI con) %{
4006   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
4007   match(Set dst (ReplicateS con));
4008   format %{ "movq    $dst,[$constantaddress]\n\t"
4009             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
4010   ins_encode %{
4011     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4012     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4013   %}
4014   ins_pipe( pipe_slow );
4015 %}
4016 
4017 instruct Repl16S(vecY dst, rRegI src) %{
4018   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
4019   match(Set dst (ReplicateS src));
4020   format %{ "movd    $dst,$src\n\t"
4021             "pshuflw $dst,$dst,0x00\n\t"
4022             "punpcklqdq $dst,$dst\n\t"
4023             "vinserti128_high $dst,$dst\t! replicate16S" %}
4024   ins_encode %{
4025     __ movdl($dst$$XMMRegister, $src$$Register);
4026     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4027     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4028     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4029   %}
4030   ins_pipe( pipe_slow );
4031 %}
4032 
4033 instruct Repl16S_mem(vecY dst, memory mem) %{
4034   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
4035   match(Set dst (ReplicateS (LoadS mem)));
4036   format %{ "pshuflw $dst,$mem,0x00\n\t"
4037             "punpcklqdq $dst,$dst\n\t"
4038             "vinserti128_high $dst,$dst\t! replicate16S" %}
4039   ins_encode %{
4040     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
4041     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4042     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4043   %}
4044   ins_pipe( pipe_slow );
4045 %}
4046 
4047 instruct Repl16S_imm(vecY dst, immI con) %{
4048   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
4049   match(Set dst (ReplicateS con));
4050   format %{ "movq    $dst,[$constantaddress]\n\t"
4051             "punpcklqdq $dst,$dst\n\t"
4052             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
4053   ins_encode %{
4054     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4055     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4056     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4057   %}
4058   ins_pipe( pipe_slow );
4059 %}
4060 
4061 instruct Repl32S(legVecZ dst, rRegI src) %{
4062   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
4063   match(Set dst (ReplicateS src));
4064   format %{ "movd    $dst,$src\n\t"
4065             "pshuflw $dst,$dst,0x00\n\t"
4066             "punpcklqdq $dst,$dst\n\t"
4067             "vinserti128_high $dst,$dst\t"
4068             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
4069   ins_encode %{
4070     __ movdl($dst$$XMMRegister, $src$$Register);
4071     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4072     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4073     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4074     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4075   %}
4076   ins_pipe( pipe_slow );
4077 %}
4078 
4079 instruct Repl32S_mem(legVecZ dst, memory mem) %{
4080   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
4081   match(Set dst (ReplicateS (LoadS mem)));
4082   format %{ "pshuflw $dst,$mem,0x00\n\t"
4083             "punpcklqdq $dst,$dst\n\t"
4084             "vinserti128_high $dst,$dst\t"
4085             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
4086   ins_encode %{
4087     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
4088     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4089     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4090     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4091   %}
4092   ins_pipe( pipe_slow );
4093 %}
4094 
4095 instruct Repl32S_imm(legVecZ dst, immI con) %{
4096   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
4097   match(Set dst (ReplicateS con));
4098   format %{ "movq    $dst,[$constantaddress]\n\t"
4099             "punpcklqdq $dst,$dst\n\t"
4100             "vinserti128_high $dst,$dst\t"
4101             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
4102   ins_encode %{
4103     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4104     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4105     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4106     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4107   %}
4108   ins_pipe( pipe_slow );
4109 %}
4110 
4111 instruct Repl4I(vecX dst, rRegI src) %{
4112   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4113   match(Set dst (ReplicateI src));
4114   format %{ "movd    $dst,$src\n\t"
4115             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
4116   ins_encode %{
4117     __ movdl($dst$$XMMRegister, $src$$Register);
4118     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4119   %}
4120   ins_pipe( pipe_slow );
4121 %}
4122 
4123 instruct Repl4I_mem(vecX dst, memory mem) %{
4124   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4125   match(Set dst (ReplicateI (LoadI mem)));
4126   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
4127   ins_encode %{
4128     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4129   %}
4130   ins_pipe( pipe_slow );
4131 %}
4132 
4133 instruct Repl8I(vecY dst, rRegI src) %{
4134   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4135   match(Set dst (ReplicateI src));
4136   format %{ "movd    $dst,$src\n\t"
4137             "pshufd  $dst,$dst,0x00\n\t"
4138             "vinserti128_high $dst,$dst\t! replicate8I" %}
4139   ins_encode %{
4140     __ movdl($dst$$XMMRegister, $src$$Register);
4141     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4142     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4143   %}
4144   ins_pipe( pipe_slow );
4145 %}
4146 
4147 instruct Repl8I_mem(vecY dst, memory mem) %{
4148   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4149   match(Set dst (ReplicateI (LoadI mem)));
4150   format %{ "pshufd  $dst,$mem,0x00\n\t"
4151             "vinserti128_high $dst,$dst\t! replicate8I" %}
4152   ins_encode %{
4153     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4154     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4155   %}
4156   ins_pipe( pipe_slow );
4157 %}
4158 
4159 instruct Repl16I(legVecZ dst, rRegI src) %{
4160   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4161   match(Set dst (ReplicateI src));
4162   format %{ "movd    $dst,$src\n\t"
4163             "pshufd  $dst,$dst,0x00\n\t"
4164             "vinserti128_high $dst,$dst\t"
4165             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
4166   ins_encode %{
4167     __ movdl($dst$$XMMRegister, $src$$Register);
4168     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4169     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4170     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4171   %}
4172   ins_pipe( pipe_slow );
4173 %}
4174 
4175 instruct Repl16I_mem(legVecZ dst, memory mem) %{
4176   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4177   match(Set dst (ReplicateI (LoadI mem)));
4178   format %{ "pshufd  $dst,$mem,0x00\n\t"
4179             "vinserti128_high $dst,$dst\t"
4180             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
4181   ins_encode %{
4182     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4183     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4184     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4185   %}
4186   ins_pipe( pipe_slow );
4187 %}
4188 
4189 instruct Repl4I_imm(vecX dst, immI con) %{
4190   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4191   match(Set dst (ReplicateI con));
4192   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4193             "punpcklqdq $dst,$dst" %}
4194   ins_encode %{
4195     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4196     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4197   %}
4198   ins_pipe( pipe_slow );
4199 %}
4200 
4201 instruct Repl8I_imm(vecY dst, immI con) %{
4202   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4203   match(Set dst (ReplicateI con));
4204   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4205             "punpcklqdq $dst,$dst\n\t"
4206             "vinserti128_high $dst,$dst" %}
4207   ins_encode %{
4208     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4209     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4210     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4211   %}
4212   ins_pipe( pipe_slow );
4213 %}
4214 
4215 instruct Repl16I_imm(legVecZ dst, immI con) %{
4216   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4217   match(Set dst (ReplicateI con));
4218   format %{ "movq    $dst,[$constantaddress]\t"
4219             "punpcklqdq $dst,$dst\n\t"
4220             "vinserti128_high $dst,$dst"
4221             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
4222   ins_encode %{
4223     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4224     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4225     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4226     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4227   %}
4228   ins_pipe( pipe_slow );
4229 %}
4230 
// A long can be loaded into an xmm register directly from memory.
4232 instruct Repl2L_mem(vecX dst, memory mem) %{
4233   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
4234   match(Set dst (ReplicateL (LoadL mem)));
4235   format %{ "movq    $dst,$mem\n\t"
4236             "punpcklqdq $dst,$dst\t! replicate2L" %}
4237   ins_encode %{
4238     __ movq($dst$$XMMRegister, $mem$$Address);
4239     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4240   %}
4241   ins_pipe( pipe_slow );
4242 %}
4243 
4244 // Replicate long (8 byte) scalar to be vector
4245 #ifdef _LP64
4246 instruct Repl4L(vecY dst, rRegL src) %{
4247   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4248   match(Set dst (ReplicateL src));
4249   format %{ "movdq   $dst,$src\n\t"
4250             "punpcklqdq $dst,$dst\n\t"
4251             "vinserti128_high $dst,$dst\t! replicate4L" %}
4252   ins_encode %{
4253     __ movdq($dst$$XMMRegister, $src$$Register);
4254     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4255     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4256   %}
4257   ins_pipe( pipe_slow );
4258 %}
4259 
4260 instruct Repl8L(legVecZ dst, rRegL src) %{
4261   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4262   match(Set dst (ReplicateL src));
4263   format %{ "movdq   $dst,$src\n\t"
4264             "punpcklqdq $dst,$dst\n\t"
4265             "vinserti128_high $dst,$dst\t"
4266             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
4267   ins_encode %{
4268     __ movdq($dst$$XMMRegister, $src$$Register);
4269     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4270     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4271     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4272   %}
4273   ins_pipe( pipe_slow );
4274 %}
4275 #else // _LP64
4276 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
4277   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4278   match(Set dst (ReplicateL src));
4279   effect(TEMP dst, USE src, TEMP tmp);
4280   format %{ "movdl   $dst,$src.lo\n\t"
4281             "movdl   $tmp,$src.hi\n\t"
4282             "punpckldq $dst,$tmp\n\t"
4283             "punpcklqdq $dst,$dst\n\t"
4284             "vinserti128_high $dst,$dst\t! replicate4L" %}
4285   ins_encode %{
4286     __ movdl($dst$$XMMRegister, $src$$Register);
4287     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4288     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4289     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4290     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4291   %}
4292   ins_pipe( pipe_slow );
4293 %}
4294 
4295 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
4296   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4297   match(Set dst (ReplicateL src));
4298   effect(TEMP dst, USE src, TEMP tmp);
4299   format %{ "movdl   $dst,$src.lo\n\t"
4300             "movdl   $tmp,$src.hi\n\t"
4301             "punpckldq $dst,$tmp\n\t"
4302             "punpcklqdq $dst,$dst\n\t"
4303             "vinserti128_high $dst,$dst\t"
4304             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
4305   ins_encode %{
4306     __ movdl($dst$$XMMRegister, $src$$Register);
4307     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4308     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4309     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4310     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4311     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4312   %}
4313   ins_pipe( pipe_slow );
4314 %}
4315 #endif // _LP64
4316 
4317 instruct Repl4L_imm(vecY dst, immL con) %{
4318   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4319   match(Set dst (ReplicateL con));
4320   format %{ "movq    $dst,[$constantaddress]\n\t"
4321             "punpcklqdq $dst,$dst\n\t"
4322             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
4323   ins_encode %{
4324     __ movq($dst$$XMMRegister, $constantaddress($con));
4325     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4326     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4327   %}
4328   ins_pipe( pipe_slow );
4329 %}
4330 
4331 instruct Repl8L_imm(legVecZ dst, immL con) %{
4332   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4333   match(Set dst (ReplicateL con));
4334   format %{ "movq    $dst,[$constantaddress]\n\t"
4335             "punpcklqdq $dst,$dst\n\t"
4336             "vinserti128_high $dst,$dst\t"
4337             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
4338   ins_encode %{
4339     __ movq($dst$$XMMRegister, $constantaddress($con));
4340     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4341     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4342     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4343   %}
4344   ins_pipe( pipe_slow );
4345 %}
4346 
4347 instruct Repl4L_mem(vecY dst, memory mem) %{
4348   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4349   match(Set dst (ReplicateL (LoadL mem)));
4350   format %{ "movq    $dst,$mem\n\t"
4351             "punpcklqdq $dst,$dst\n\t"
4352             "vinserti128_high $dst,$dst\t! replicate4L" %}
4353   ins_encode %{
4354     __ movq($dst$$XMMRegister, $mem$$Address);
4355     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4356     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4357   %}
4358   ins_pipe( pipe_slow );
4359 %}
4360 
4361 instruct Repl8L_mem(legVecZ dst, memory mem) %{
4362   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4363   match(Set dst (ReplicateL (LoadL mem)));
4364   format %{ "movq    $dst,$mem\n\t"
4365             "punpcklqdq $dst,$dst\n\t"
4366             "vinserti128_high $dst,$dst\t"
4367             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
4368   ins_encode %{
4369     __ movq($dst$$XMMRegister, $mem$$Address);
4370     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4371     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4372     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4373   %}
4374   ins_pipe( pipe_slow );
4375 %}
4376 
4377 instruct Repl2F_mem(vecD dst, memory mem) %{
4378   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4379   match(Set dst (ReplicateF (LoadF mem)));
4380   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
4381   ins_encode %{
4382     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4383   %}
4384   ins_pipe( pipe_slow );
4385 %}
4386 
4387 instruct Repl4F_mem(vecX dst, memory mem) %{
4388   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4389   match(Set dst (ReplicateF (LoadF mem)));
4390   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
4391   ins_encode %{
4392     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4393   %}
4394   ins_pipe( pipe_slow );
4395 %}
4396 
4397 instruct Repl8F(vecY dst, vlRegF src) %{
4398   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4399   match(Set dst (ReplicateF src));
4400   format %{ "pshufd  $dst,$src,0x00\n\t"
4401             "vinsertf128_high $dst,$dst\t! replicate8F" %}
4402   ins_encode %{
4403     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4404     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4405   %}
4406   ins_pipe( pipe_slow );
4407 %}
4408 
4409 instruct Repl8F_mem(vecY dst, memory mem) %{
4410   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4411   match(Set dst (ReplicateF (LoadF mem)));
4412   format %{ "pshufd  $dst,$mem,0x00\n\t"
4413             "vinsertf128_high $dst,$dst\t! replicate8F" %}
4414   ins_encode %{
4415     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4416     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4417   %}
4418   ins_pipe( pipe_slow );
4419 %}
4420 
4421 instruct Repl16F(legVecZ dst, vlRegF src) %{
4422   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4423   match(Set dst (ReplicateF src));
4424   format %{ "pshufd  $dst,$src,0x00\n\t"
4425             "vinsertf128_high $dst,$dst\t"
4426             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
4427   ins_encode %{
4428     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4429     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4430     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4431   %}
4432   ins_pipe( pipe_slow );
4433 %}
4434 
4435 instruct Repl16F_mem(legVecZ dst, memory mem) %{
4436   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4437   match(Set dst (ReplicateF (LoadF mem)));
4438   format %{ "pshufd  $dst,$mem,0x00\n\t"
4439             "vinsertf128_high $dst,$dst\t"
4440             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
4441   ins_encode %{
4442     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4443     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4444     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4445   %}
4446   ins_pipe( pipe_slow );
4447 %}
4448 
4449 instruct Repl2F_zero(vecD dst, immF0 zero) %{
4450   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
4451   match(Set dst (ReplicateF zero));
4452   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
4453   ins_encode %{
4454     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4455   %}
4456   ins_pipe( fpu_reg_reg );
4457 %}
4458 
4459 instruct Repl4F_zero(vecX dst, immF0 zero) %{
4460   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
4461   match(Set dst (ReplicateF zero));
4462   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
4463   ins_encode %{
4464     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4465   %}
4466   ins_pipe( fpu_reg_reg );
4467 %}
4468 
4469 instruct Repl8F_zero(vecY dst, immF0 zero) %{
4470   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
4471   match(Set dst (ReplicateF zero));
4472   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
4473   ins_encode %{
4474     int vector_len = 1;
4475     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4476   %}
4477   ins_pipe( fpu_reg_reg );
4478 %}
4479 
4480 instruct Repl2D_mem(vecX dst, memory mem) %{
4481   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4482   match(Set dst (ReplicateD (LoadD mem)));
4483   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
4484   ins_encode %{
4485     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4486   %}
4487   ins_pipe( pipe_slow );
4488 %}
4489 
4490 instruct Repl4D(vecY dst, vlRegD src) %{
4491   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4492   match(Set dst (ReplicateD src));
4493   format %{ "pshufd  $dst,$src,0x44\n\t"
4494             "vinsertf128_high $dst,$dst\t! replicate4D" %}
4495   ins_encode %{
4496     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4497     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4498   %}
4499   ins_pipe( pipe_slow );
4500 %}
4501 
4502 instruct Repl4D_mem(vecY dst, memory mem) %{
4503   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4504   match(Set dst (ReplicateD (LoadD mem)));
4505   format %{ "pshufd  $dst,$mem,0x44\n\t"
4506             "vinsertf128_high $dst,$dst\t! replicate4D" %}
4507   ins_encode %{
4508     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4509     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4510   %}
4511   ins_pipe( pipe_slow );
4512 %}
4513 
4514 instruct Repl8D(legVecZ dst, vlRegD src) %{
4515   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4516   match(Set dst (ReplicateD src));
4517   format %{ "pshufd  $dst,$src,0x44\n\t"
4518             "vinsertf128_high $dst,$dst\t"
4519             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
4520   ins_encode %{
4521     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4522     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4523     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4524   %}
4525   ins_pipe( pipe_slow );
4526 %}
4527 
4528 instruct Repl8D_mem(legVecZ dst, memory mem) %{
4529   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4530   match(Set dst (ReplicateD (LoadD mem)));
4531   format %{ "pshufd  $dst,$mem,0x44\n\t"
4532             "vinsertf128_high $dst,$dst\t"
4533             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
4534   ins_encode %{
4535     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4536     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4537     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4538   %}
4539   ins_pipe( pipe_slow );
4540 %}
4541 
4542 // Replicate double (8 byte) scalar zero to be vector
4543 instruct Repl2D_zero(vecX dst, immD0 zero) %{
4544   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
4545   match(Set dst (ReplicateD zero));
4546   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
4547   ins_encode %{
4548     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4549   %}
4550   ins_pipe( fpu_reg_reg );
4551 %}
4552 
4553 instruct Repl4D_zero(vecY dst, immD0 zero) %{
4554   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
4555   match(Set dst (ReplicateD zero));
4556   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
4557   ins_encode %{
4558     int vector_len = 1;
4559     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4560   %}
4561   ins_pipe( fpu_reg_reg );
4562 %}
4563 
4564 // ====================GENERIC REPLICATE==========================================
4565 
4566 // Replicate byte scalar to be vector
4567 instruct Repl4B(vecS dst, rRegI src) %{
4568   predicate(n->as_Vector()->length() == 4);
4569   match(Set dst (ReplicateB src));
4570   format %{ "movd    $dst,$src\n\t"
4571             "punpcklbw $dst,$dst\n\t"
4572             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
4573   ins_encode %{
4574     __ movdl($dst$$XMMRegister, $src$$Register);
4575     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4576     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4577   %}
4578   ins_pipe( pipe_slow );
4579 %}
4580 
4581 instruct Repl8B(vecD dst, rRegI src) %{
4582   predicate(n->as_Vector()->length() == 8);
4583   match(Set dst (ReplicateB src));
4584   format %{ "movd    $dst,$src\n\t"
4585             "punpcklbw $dst,$dst\n\t"
4586             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
4587   ins_encode %{
4588     __ movdl($dst$$XMMRegister, $src$$Register);
4589     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4590     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4591   %}
4592   ins_pipe( pipe_slow );
4593 %}
4594 
4595 // Replicate byte scalar immediate to be vector by loading from const table.
4596 instruct Repl4B_imm(vecS dst, immI con) %{
4597   predicate(n->as_Vector()->length() == 4);
4598   match(Set dst (ReplicateB con));
4599   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
4600   ins_encode %{
4601     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
4602   %}
4603   ins_pipe( pipe_slow );
4604 %}
4605 
4606 instruct Repl8B_imm(vecD dst, immI con) %{
4607   predicate(n->as_Vector()->length() == 8);
4608   match(Set dst (ReplicateB con));
4609   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
4610   ins_encode %{
4611     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4612   %}
4613   ins_pipe( pipe_slow );
4614 %}
4615 
4616 // Replicate byte scalar zero to be vector
4617 instruct Repl4B_zero(vecS dst, immI0 zero) %{
4618   predicate(n->as_Vector()->length() == 4);
4619   match(Set dst (ReplicateB zero));
4620   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
4621   ins_encode %{
4622     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4623   %}
4624   ins_pipe( fpu_reg_reg );
4625 %}
4626 
4627 instruct Repl8B_zero(vecD dst, immI0 zero) %{
4628   predicate(n->as_Vector()->length() == 8);
4629   match(Set dst (ReplicateB zero));
4630   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
4631   ins_encode %{
4632     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4633   %}
4634   ins_pipe( fpu_reg_reg );
4635 %}
4636 
4637 instruct Repl16B_zero(vecX dst, immI0 zero) %{
4638   predicate(n->as_Vector()->length() == 16);
4639   match(Set dst (ReplicateB zero));
4640   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
4641   ins_encode %{
4642     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4643   %}
4644   ins_pipe( fpu_reg_reg );
4645 %}
4646 
4647 instruct Repl32B_zero(vecY dst, immI0 zero) %{
4648   predicate(n->as_Vector()->length() == 32);
4649   match(Set dst (ReplicateB zero));
4650   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
4651   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1 (256-bit vpxor requires AVX2).
4653     int vector_len = 1;
4654     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4655   %}
4656   ins_pipe( fpu_reg_reg );
4657 %}
4658 
4659 // Replicate char/short (2 byte) scalar to be vector
4660 instruct Repl2S(vecS dst, rRegI src) %{
4661   predicate(n->as_Vector()->length() == 2);
4662   match(Set dst (ReplicateS src));
4663   format %{ "movd    $dst,$src\n\t"
4664             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
4665   ins_encode %{
4666     __ movdl($dst$$XMMRegister, $src$$Register);
4667     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4668   %}
4669   ins_pipe( fpu_reg_reg );
4670 %}
4671 
4672 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
4673 instruct Repl2S_imm(vecS dst, immI con) %{
4674   predicate(n->as_Vector()->length() == 2);
4675   match(Set dst (ReplicateS con));
4676   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
4677   ins_encode %{
4678     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4679   %}
4680   ins_pipe( fpu_reg_reg );
4681 %}
4682 
4683 instruct Repl4S_imm(vecD dst, immI con) %{
4684   predicate(n->as_Vector()->length() == 4);
4685   match(Set dst (ReplicateS con));
4686   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4687   ins_encode %{
4688     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4689   %}
4690   ins_pipe( fpu_reg_reg );
4691 %}
4692 
4693 // Replicate char/short (2 byte) scalar zero to be vector
4694 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4695   predicate(n->as_Vector()->length() == 2);
4696   match(Set dst (ReplicateS zero));
4697   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4698   ins_encode %{
4699     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4700   %}
4701   ins_pipe( fpu_reg_reg );
4702 %}
4703 
4704 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4705   predicate(n->as_Vector()->length() == 4);
4706   match(Set dst (ReplicateS zero));
4707   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4708   ins_encode %{
4709     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4710   %}
4711   ins_pipe( fpu_reg_reg );
4712 %}
4713 
4714 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4715   predicate(n->as_Vector()->length() == 8);
4716   match(Set dst (ReplicateS zero));
4717   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4718   ins_encode %{
4719     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4720   %}
4721   ins_pipe( fpu_reg_reg );
4722 %}
4723 
4724 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4725   predicate(n->as_Vector()->length() == 16);
4726   match(Set dst (ReplicateS zero));
4727   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4728   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1 (256-bit vpxor requires AVX2).
4730     int vector_len = 1;
4731     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4732   %}
4733   ins_pipe( fpu_reg_reg );
4734 %}
4735 
4736 // Replicate integer (4 byte) scalar to be vector
4737 instruct Repl2I(vecD dst, rRegI src) %{
4738   predicate(n->as_Vector()->length() == 2);
4739   match(Set dst (ReplicateI src));
4740   format %{ "movd    $dst,$src\n\t"
4741             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4742   ins_encode %{
4743     __ movdl($dst$$XMMRegister, $src$$Register);
4744     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4745   %}
4746   ins_pipe( fpu_reg_reg );
4747 %}
4748 
// An integer can be loaded into an xmm register directly from memory.
4750 instruct Repl2I_mem(vecD dst, memory mem) %{
4751   predicate(n->as_Vector()->length() == 2);
4752   match(Set dst (ReplicateI (LoadI mem)));
4753   format %{ "movd    $dst,$mem\n\t"
4754             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4755   ins_encode %{
4756     __ movdl($dst$$XMMRegister, $mem$$Address);
4757     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4758   %}
4759   ins_pipe( fpu_reg_reg );
4760 %}
4761 
4762 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4763 instruct Repl2I_imm(vecD dst, immI con) %{
4764   predicate(n->as_Vector()->length() == 2);
4765   match(Set dst (ReplicateI con));
4766   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4767   ins_encode %{
4768     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4769   %}
4770   ins_pipe( fpu_reg_reg );
4771 %}
4772 
4773 // Replicate integer (4 byte) scalar zero to be vector
4774 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4775   predicate(n->as_Vector()->length() == 2);
4776   match(Set dst (ReplicateI zero));
4777   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4778   ins_encode %{
4779     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4780   %}
4781   ins_pipe( fpu_reg_reg );
4782 %}
4783 
4784 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4785   predicate(n->as_Vector()->length() == 4);
4786   match(Set dst (ReplicateI zero));
4787   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4788   ins_encode %{
4789     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4790   %}
4791   ins_pipe( fpu_reg_reg );
4792 %}
4793 
4794 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4795   predicate(n->as_Vector()->length() == 8);
4796   match(Set dst (ReplicateI zero));
4797   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4798   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1 (256-bit vpxor requires AVX2).
4800     int vector_len = 1;
4801     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4802   %}
4803   ins_pipe( fpu_reg_reg );
4804 %}
4805 
4806 // Replicate long (8 byte) scalar to be vector
4807 #ifdef _LP64
4808 instruct Repl2L(vecX dst, rRegL src) %{
4809   predicate(n->as_Vector()->length() == 2);
4810   match(Set dst (ReplicateL src));
4811   format %{ "movdq   $dst,$src\n\t"
4812             "punpcklqdq $dst,$dst\t! replicate2L" %}
4813   ins_encode %{
4814     __ movdq($dst$$XMMRegister, $src$$Register);
4815     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4816   %}
4817   ins_pipe( pipe_slow );
4818 %}
4819 #else // _LP64
4820 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4821   predicate(n->as_Vector()->length() == 2);
4822   match(Set dst (ReplicateL src));
4823   effect(TEMP dst, USE src, TEMP tmp);
4824   format %{ "movdl   $dst,$src.lo\n\t"
4825             "movdl   $tmp,$src.hi\n\t"
4826             "punpckldq $dst,$tmp\n\t"
4827             "punpcklqdq $dst,$dst\t! replicate2L"%}
4828   ins_encode %{
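    // 32-bit VM: the long lives in a GPR pair, so build the 64-bit lane
    // from the lo/hi halves before replicating it.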
4829     __ movdl($dst$$XMMRegister, $src$$Register);
4830     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4831     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4832     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4833   %}
4834   ins_pipe( pipe_slow );
4835 %}
4836 #endif // _LP64
4837 
4838 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4839 instruct Repl2L_imm(vecX dst, immL con) %{
4840   predicate(n->as_Vector()->length() == 2);
4841   match(Set dst (ReplicateL con));
4842   format %{ "movq    $dst,[$constantaddress]\n\t"
4843             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4844   ins_encode %{
4845     __ movq($dst$$XMMRegister, $constantaddress($con));
4846     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4847   %}
4848   ins_pipe( pipe_slow );
4849 %}
4850 
4851 // Replicate long (8 byte) scalar zero to be vector
4852 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4853   predicate(n->as_Vector()->length() == 2);
4854   match(Set dst (ReplicateL zero));
4855   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4856   ins_encode %{
4857     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4858   %}
4859   ins_pipe( fpu_reg_reg );
4860 %}
4861 
4862 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4863   predicate(n->as_Vector()->length() == 4);
4864   match(Set dst (ReplicateL zero));
4865   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4866   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd on AVX1 (256-bit vpxor requires AVX2).
4868     int vector_len = 1;
4869     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4870   %}
4871   ins_pipe( fpu_reg_reg );
4872 %}
4873 
4874 // Replicate float (4 byte) scalar to be vector
4875 instruct Repl2F(vecD dst, vlRegF src) %{
4876   predicate(n->as_Vector()->length() == 2);
4877   match(Set dst (ReplicateF src));
4878   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4879   ins_encode %{
4880     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4881   %}
4882   ins_pipe( fpu_reg_reg );
4883 %}
4884 
4885 instruct Repl4F(vecX dst, vlRegF src) %{
4886   predicate(n->as_Vector()->length() == 4);
4887   match(Set dst (ReplicateF src));
4888   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4889   ins_encode %{
4890     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4891   %}
4892   ins_pipe( pipe_slow );
4893 %}
4894 
4895 // Replicate double (8 bytes) scalar to be vector
4896 instruct Repl2D(vecX dst, vlRegD src) %{
4897   predicate(n->as_Vector()->length() == 2);
4898   match(Set dst (ReplicateD src));
4899   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4900   ins_encode %{
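    // Shuffle immediate 0x44 selects dwords {0,1,0,1}, copying the low
    // 64-bit double into both lanes.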
4901     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4902   %}
4903   ins_pipe( pipe_slow );
4904 %}
4905 
4906 // ====================EVEX REPLICATE=============================================
4907 
4908 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4909   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4910   match(Set dst (ReplicateB (LoadB mem)));
4911   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4912   ins_encode %{
4913     int vector_len = 0;
4914     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4915   %}
4916   ins_pipe( pipe_slow );
4917 %}
4918 
4919 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4920   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4921   match(Set dst (ReplicateB (LoadB mem)));
4922   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4923   ins_encode %{
4924     int vector_len = 0;
4925     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4926   %}
4927   ins_pipe( pipe_slow );
4928 %}
4929 
4930 instruct Repl16B_evex(vecX dst, rRegI src) %{
4931   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4932   match(Set dst (ReplicateB src));
4933   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4934   ins_encode %{
    int vector_len = 0;
4936     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4937   %}
4938   ins_pipe( pipe_slow );
4939 %}
4940 
4941 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4942   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4943   match(Set dst (ReplicateB (LoadB mem)));
4944   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4945   ins_encode %{
4946     int vector_len = 0;
4947     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4948   %}
4949   ins_pipe( pipe_slow );
4950 %}
4951 
4952 instruct Repl32B_evex(vecY dst, rRegI src) %{
4953   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4954   match(Set dst (ReplicateB src));
4955   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4956   ins_encode %{
    int vector_len = 1;
4958     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4959   %}
4960   ins_pipe( pipe_slow );
4961 %}
4962 
4963 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4964   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4965   match(Set dst (ReplicateB (LoadB mem)));
4966   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4967   ins_encode %{
4968     int vector_len = 1;
4969     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4970   %}
4971   ins_pipe( pipe_slow );
4972 %}
4973 
4974 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4975   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4976   match(Set dst (ReplicateB src));
4977   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4978   ins_encode %{
    int vector_len = 2;
4980     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4981   %}
4982   ins_pipe( pipe_slow );
4983 %}
4984 
4985 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4986   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4987   match(Set dst (ReplicateB (LoadB mem)));
4988   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4989   ins_encode %{
4990     int vector_len = 2;
4991     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4992   %}
4993   ins_pipe( pipe_slow );
4994 %}
4995 
4996 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4997   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4998   match(Set dst (ReplicateB con));
4999   format %{ "movq    $dst,[$constantaddress]\n\t"
5000             "vpbroadcastb $dst,$dst\t! replicate16B" %}
5001   ins_encode %{
    int vector_len = 0;
5003     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
5004     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5005   %}
5006   ins_pipe( pipe_slow );
5007 %}
5008 
5009 instruct Repl32B_imm_evex(vecY dst, immI con) %{
5010   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5011   match(Set dst (ReplicateB con));
5012   format %{ "movq    $dst,[$constantaddress]\n\t"
5013             "vpbroadcastb $dst,$dst\t! replicate32B" %}
5014   ins_encode %{
    int vector_len = 1;
5016     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
5017     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5018   %}
5019   ins_pipe( pipe_slow );
5020 %}
5021 
5022 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
5023   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
5024   match(Set dst (ReplicateB con));
5025   format %{ "movq    $dst,[$constantaddress]\n\t"
5026             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
5027   ins_encode %{
    int vector_len = 2;
5029     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
5030     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5031   %}
5032   ins_pipe( pipe_slow );
5033 %}
5034 
5035 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
5036   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
5037   match(Set dst (ReplicateB zero));
5038   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
5039   ins_encode %{
    // For 512-bit vectors MacroAssembler::vpxor emits the EVEX-encoded evpxorq.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5043   %}
5044   ins_pipe( fpu_reg_reg );
5045 %}
5046 
5047 instruct Repl4S_evex(vecD dst, rRegI src) %{
5048   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5049   match(Set dst (ReplicateS src));
5050   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
5051   ins_encode %{
5052    int vector_len = 0;
5053     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5054   %}
5055   ins_pipe( pipe_slow );
5056 %}
5057 
5058 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
5059   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5060   match(Set dst (ReplicateS (LoadS mem)));
5061   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
5062   ins_encode %{
5063     int vector_len = 0;
5064     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5065   %}
5066   ins_pipe( pipe_slow );
5067 %}
5068 
5069 instruct Repl8S_evex(vecX dst, rRegI src) %{
5070   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5071   match(Set dst (ReplicateS src));
5072   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
5073   ins_encode %{
5074    int vector_len = 0;
5075     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5076   %}
5077   ins_pipe( pipe_slow );
5078 %}
5079 
5080 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
5081   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5082   match(Set dst (ReplicateS (LoadS mem)));
5083   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
5084   ins_encode %{
5085     int vector_len = 0;
5086     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5087   %}
5088   ins_pipe( pipe_slow );
5089 %}
5090 
5091 instruct Repl16S_evex(vecY dst, rRegI src) %{
5092   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5093   match(Set dst (ReplicateS src));
5094   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
5095   ins_encode %{
5096    int vector_len = 1;
5097     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5098   %}
5099   ins_pipe( pipe_slow );
5100 %}
5101 
5102 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
5103   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5104   match(Set dst (ReplicateS (LoadS mem)));
5105   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
5106   ins_encode %{
5107     int vector_len = 1;
5108     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5109   %}
5110   ins_pipe( pipe_slow );
5111 %}
5112 
5113 instruct Repl32S_evex(vecZ dst, rRegI src) %{
5114   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
5115   match(Set dst (ReplicateS src));
5116   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
5117   ins_encode %{
5118    int vector_len = 2;
5119     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5120   %}
5121   ins_pipe( pipe_slow );
5122 %}
5123 
5124 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
5125   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
5126   match(Set dst (ReplicateS (LoadS mem)));
5127   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
5128   ins_encode %{
5129     int vector_len = 2;
5130     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5131   %}
5132   ins_pipe( pipe_slow );
5133 %}
5134 
5135 instruct Repl8S_imm_evex(vecX dst, immI con) %{
5136   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5137   match(Set dst (ReplicateS con));
5138   format %{ "movq    $dst,[$constantaddress]\n\t"
5139             "vpbroadcastw $dst,$dst\t! replicate8S" %}
5140   ins_encode %{
5141    int vector_len = 0;
5142     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
5143     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5144   %}
5145   ins_pipe( pipe_slow );
5146 %}
5147 
5148 instruct Repl16S_imm_evex(vecY dst, immI con) %{
5149   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5150   match(Set dst (ReplicateS con));
5151   format %{ "movq    $dst,[$constantaddress]\n\t"
5152             "vpbroadcastw $dst,$dst\t! replicate16S" %}
5153   ins_encode %{
5154    int vector_len = 1;
5155     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
5156     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5157   %}
5158   ins_pipe( pipe_slow );
5159 %}
5160 
5161 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
5162   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
5163   match(Set dst (ReplicateS con));
5164   format %{ "movq    $dst,[$constantaddress]\n\t"
5165             "vpbroadcastw $dst,$dst\t! replicate32S" %}
5166   ins_encode %{
5167    int vector_len = 2;
5168     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
5169     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5170   %}
5171   ins_pipe( pipe_slow );
5172 %}
5173 
5174 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
5175   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
5176   match(Set dst (ReplicateS zero));
5177   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
5178   ins_encode %{
5179     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
5180     int vector_len = 2;
5181     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5182   %}
5183   ins_pipe( fpu_reg_reg );
5184 %}
5185 
5186 instruct Repl4I_evex(vecX dst, rRegI src) %{
5187   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5188   match(Set dst (ReplicateI src));
5189   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
5190   ins_encode %{
5191     int vector_len = 0;
5192     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
5193   %}
5194   ins_pipe( pipe_slow );
5195 %}
5196 
5197 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
5198   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5199   match(Set dst (ReplicateI (LoadI mem)));
5200   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
5201   ins_encode %{
5202     int vector_len = 0;
5203     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
5204   %}
5205   ins_pipe( pipe_slow );
5206 %}
5207 
5208 instruct Repl8I_evex(vecY dst, rRegI src) %{
5209   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5210   match(Set dst (ReplicateI src));
5211   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
5212   ins_encode %{
5213     int vector_len = 1;
5214     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
5215   %}
5216   ins_pipe( pipe_slow );
5217 %}
5218 
5219 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
5220   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5221   match(Set dst (ReplicateI (LoadI mem)));
5222   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
5223   ins_encode %{
5224     int vector_len = 1;
5225     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
5226   %}
5227   ins_pipe( pipe_slow );
5228 %}
5229 
5230 instruct Repl16I_evex(vecZ dst, rRegI src) %{
5231   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5232   match(Set dst (ReplicateI src));
5233   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
5234   ins_encode %{
5235     int vector_len = 2;
5236     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
5237   %}
5238   ins_pipe( pipe_slow );
5239 %}
5240 
5241 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
5242   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5243   match(Set dst (ReplicateI (LoadI mem)));
5244   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
5245   ins_encode %{
5246     int vector_len = 2;
5247     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
5248   %}
5249   ins_pipe( pipe_slow );
5250 %}
5251 
5252 instruct Repl4I_imm_evex(vecX dst, immI con) %{
5253   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5254   match(Set dst (ReplicateI con));
5255   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
5256             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
5257   ins_encode %{
5258     int vector_len = 0;
5259     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
5260     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5261   %}
5262   ins_pipe( pipe_slow );
5263 %}
5264 
5265 instruct Repl8I_imm_evex(vecY dst, immI con) %{
5266   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5267   match(Set dst (ReplicateI con));
5268   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
5269             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
5270   ins_encode %{
5271     int vector_len = 1;
5272     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
5273     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5274   %}
5275   ins_pipe( pipe_slow );
5276 %}
5277 
5278 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
5279   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5280   match(Set dst (ReplicateI con));
5281   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
5282             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
5283   ins_encode %{
5284     int vector_len = 2;
5285     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
5286     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5287   %}
5288   ins_pipe( pipe_slow );
5289 %}
5290 
5291 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
5292   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5293   match(Set dst (ReplicateI zero));
5294   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
5295   ins_encode %{
5296     // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
5297     int vector_len = 2;
5298     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5299   %}
5300   ins_pipe( fpu_reg_reg );
5301 %}
5302 
5303 // Replicate long (8 byte) scalar to be vector
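// On 32-bit VMs (no _LP64) a long occupies a register pair, so the value is
// first assembled in an XMM register (movdl the low and high halves, then
// punpckldq) before the quadword broadcast.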
#ifdef _LP64
instruct Repl4L_evex(vecY dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_evex(vecZ dst, rRegL src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL src));
  format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
#else // _LP64
instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL src));
  effect(TEMP dst, USE src, TEMP tmp);
  format %{ "movdl   $dst,$src.lo\n\t"
            "movdl   $tmp,$src.hi\n\t"
            "punpckldq $dst,$tmp\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64

instruct Repl4L_imm_evex(vecY dst, immL con) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_imm_evex(vecZ dst, immL con) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL con));
  format %{ "movq    $dst,[$constantaddress]\n\t"
            "vpbroadcastq  $dst,$dst\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ movq($dst$$XMMRegister, $constantaddress($con));
    __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2L_mem_evex(vecX dst, memory mem) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4L_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateL zero));
  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
  ins_encode %{
    // Use vpxor: plain AVX has no 512-bit vpxor, but the EVEX encoding provides it.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_evex(vecY dst, regF src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF src));
  format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8F_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16F_evex(vecZ dst, regF src) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF src));
  format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF (LoadF mem)));
  format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
  predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
  match(Set dst (ReplicateF zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
  ins_encode %{
    // Use vpxor instead of vxorps: the EVEX encoding of vxorps requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_evex(vecY dst, regD src) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD src));
  format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl4D_mem_evex(vecY dst, memory mem) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
  ins_encode %{
    int vector_len = 1;
    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8D_evex(vecZ dst, regD src) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD src));
  format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD (LoadD mem)));
  format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
  ins_encode %{
    int vector_len = 2;
    __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
  ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
  ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}

instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
  predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
  match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8D zero" %}
  ins_encode %{
    // Use vpxor instead of vxorpd: the EVEX encoding of vxorpd requires AVX512DQ, and this is a 512-bit operation.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( fpu_reg_reg );
%}
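
// Note: the Replicate*_zero_evex variants above all pass vector_len = 2; the
// 512-bit vpxor form encodes on any AVX-512 target without requiring AVX512VL,
// and clearing the full register is harmless for the narrower vector lengths.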

// ====================VECTOR INSERT=======================================

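// The wider inserts below decompose the lane index: the low bits select the
// element within a 128-bit lane, the next bit selects the 128-bit lane, and
// (for 512-bit vectors) the next bit selects the 256-bit half. The affected
// lane is extracted, modified with pinsr*/insertps, and written back.
//
// Worked example for a 64-byte insert at idx = 37:
//   x_idx = 37 & 0xF      = 5  (byte within the 128-bit lane)
//   y_idx = (37 >> 4) & 1 = 0  (128-bit lane within the 256-bit half)
//   z_idx = (37 >> 5) & 1 = 1  (256-bit half within the 512-bit register)
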
instruct rvinsert8B(vecD dst, vecD src, rRegI val, immU3 idx) %{
  predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrb  $dst,$val\t! Insert 8B" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrb($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert16B(vecX dst, vecX src, rRegI val, immU4 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrb  $dst,$val\t! Insert 16B" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrb($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert16B_avx(vecX dst, vecX src, rRegI val, immU4 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vpinsrb  $dst,$dst,$val\t! Insert 16B" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vpinsrb($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert32B(vecY dst, vecY src, vecY tmp, rRegI val, immU5 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextracti128  $tmp,$src\n\t"
            "vpinsrb  $tmp,$tmp,$val\n\t"
            "vinserti128  $dst,$dst,$tmp\t! Insert 32B" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(4);
    uint y_idx = ($idx$$constant >> 4) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrb($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert64B(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU6 idx) %{
  predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1);
  format %{ "evmovdquq  $dst,$src\n\t"
            "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrb  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
            "vinserti64x4  $dst,$dst,$tmp\t! Insert 64B" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(4);
    uint y_idx = ($idx$$constant >> 4) & 1;
    uint z_idx = ($idx$$constant >> 5) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
    }
    __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
    __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
    __ vpinsrb($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4S(vecD dst, vecD src, rRegI val, immU2 idx) %{
  predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrw  $dst,$val\t! Insert 4S" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrw($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert8S(vecX dst, vecX src, rRegI val, immU3 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrw  $dst,$val\t! Insert 8S" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrw($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert8S_avx(vecX dst, vecX src, rRegI val, immU3 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vpinsrw  $dst,$dst,$val\t! Insert 8S" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vpinsrw($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert16S(vecY dst, vecY src, vecX tmp, rRegI val, immU4 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextracti128  $tmp,$src\n\t"
            "vpinsrw  $tmp,$tmp,$val\n\t"
            "vinserti128  $dst,$dst,$tmp\t! Insert 16S" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(3);
    uint y_idx = ($idx$$constant >> 3) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrw($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert32S(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU5 idx) %{
  predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1);
  format %{ "evmovdquq  $dst,$src\n\t"
            "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrw  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
            "vinserti64x4  $dst,$dst,$tmp\t! Insert 32S" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(3);
    uint y_idx = ($idx$$constant >> 3) & 1;
    uint z_idx = ($idx$$constant >> 4) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
    }
    __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
    __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
    __ vpinsrw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2I(vecD dst, vecD src, rRegI val, immU1 idx) %{
  predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrd  $dst,$val\t! Insert 2I" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrd($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4I(vecX dst, vecX src, rRegI val, immU2 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrd  $dst,$val\t! Insert 4I" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrd($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4I_avx(vecX dst, vecX src, rRegI val, immU2 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vpinsrd  $dst,$dst,$val\t! Insert 4I" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vpinsrd($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert8I(vecY dst, vecY src, vecY tmp, rRegI val, immU3 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextracti128  $tmp,$src\n\t"
            "vpinsrd  $tmp,$tmp,$val\n\t"
            "vinserti128  $dst,$dst,$tmp\t! Insert 8I" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(2);
    uint y_idx = ($idx$$constant >> 2) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrd($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert16I(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU4 idx) %{
  predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1);
  format %{ "evmovdquq  $dst,$src\n\t"
            "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrd  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
            "vinserti64x4  $dst,$dst,$tmp\t! Insert 16I" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(2);
    uint y_idx = ($idx$$constant >> 2) & 1;
    uint z_idx = ($idx$$constant >> 3) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
    }
    __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
    __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
    __ vpinsrd($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert1L(vecD dst, vecD src, rRegL val, immI0 idx) %{
  predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrq  $dst,$val\t! Insert 1L" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrq($dst$$XMMRegister, $val$$Register, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2L(vecX dst, vecX src, rRegL val, immU1 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "pinsrq  $dst,$val\t! Insert 2L" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2L_avx(vecX dst, vecX src, rRegL val, immU1 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vpinsrq  $dst,$dst,$val\t! Insert 2L" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vpinsrq($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4L(vecY dst, vecY src, vecY tmp, rRegL val, immU2 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextracti128  $tmp,$src\n\t"
            "vpinsrq  $tmp,$tmp,$val\n\t"
            "vinserti128  $dst,$dst,$tmp\t! Insert 4L" %}
  ins_encode %{
    uint x_idx = $idx$$constant & 1;
    uint y_idx = ($idx$$constant >> 1) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert8L(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegL val, immU3 idx) %{
  predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1);
  format %{ "evmovdquq  $dst,$src\n\t"
            "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrq  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
            "vinserti64x4  $dst,$dst,$tmp\t! Insert 8L" %}
  ins_encode %{
    uint x_idx = $idx$$constant & 1;
    uint y_idx = ($idx$$constant >> 1) & 1;
    uint z_idx = ($idx$$constant >> 2) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
    }
    __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
    __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
    __ vpinsrq($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
    __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
    __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2F(vecD dst, vecD src, regF val, immU1 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "insertps  $dst,$val\t! Insert 2F" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2F_avx(vecD dst, vecD src, regF val, immU1 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "vinsertps  $dst,$dst,$val\t! Insert 2F" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vinsertps($dst$$XMMRegister, $dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4F(vecX dst, vecX src, regF val, immU2 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "movdqu  $dst,$src\n\t"
            "insertps  $dst,$val\t! Insert 4F" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4F_avx(vecX dst, vecX src, regF val, immU2 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vinsertps  $dst,$dst,$val\t! Insert 4F" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vinsertps($dst$$XMMRegister, $dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert8F(vecY dst, vecY src, vecY tmp, regF val, immU3 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextractf128  $tmp,$src\n\t"
            "vinsertps  $tmp,$tmp,$val\n\t"
            "vinsertf128  $dst,$dst,$tmp\t! Insert 8F" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(2);
    uint y_idx = ($idx$$constant >> 2) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ vinsertps($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$XMMRegister, x_idx);
    __ vinsertf128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert16F(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, regF val, immU4 idx) %{
  predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1);
  format %{ "evmovdquq  $dst,$src\n\t"
            "vextractf64x4  $tmp,$src\n\t"
            "vextractf128  $tmp1,$tmp\n\t"
            "vinsertps  $tmp1,$tmp1,$val\n\t"
            "vinsertf128  $tmp,$tmp,$tmp1\n\t"
            "vinsertf64x4  $dst,$dst,$tmp\t! Insert 16F" %}
  ins_encode %{
    uint x_idx = $idx$$constant & right_n_bits(2);
    uint y_idx = ($idx$$constant >> 2) & 1;
    uint z_idx = ($idx$$constant >> 3) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
    }
    __ vextractf64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
    __ vextractf128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
    __ vinsertps($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$XMMRegister, x_idx);
    __ vinsertf128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
    __ vinsertf64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
  %}
  ins_pipe( pipe_slow );
%}

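// Note: the double-precision inserts below move $val through a 64-bit scratch
// GPR (movq) and use pinsrq/vpinsrq, since there is no quadword insert that
// takes an XMM source directly.
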
instruct rvinsert1D(vecD dst, vecD src, regD val, rRegL tmp, immI0 idx) %{
  predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "movdqu  $dst,$src\n\t"
            "movq $tmp,$val\n\t"
            "pinsrq  $dst,$tmp\t! Insert 1D" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ pinsrq($dst$$XMMRegister, $tmp$$Register, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2D(vecX dst, vecX src, regD val, rRegL tmp, immU1 idx) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "movdqu  $dst,$src\n\t"
            "movq  $tmp,$val\n\t"
            "pinsrq  $dst,$tmp\t! Insert 2D" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert2D_avx(vecX dst, vecX src, regD val, rRegL tmp, immU1 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "movq  $tmp,$val\n\t"
            "vpinsrq  $dst,$dst,$tmp\t! Insert 2D" %}
  ins_encode %{
    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ movq($tmp$$Register, $val$$XMMRegister);
    __ vpinsrq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$Register, $idx$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert4D(vecY dst, vecY src, vecY tmp, regD val, rRegL tmp1, immU2 idx) %{
  predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextractf128  $tmp,$src\n\t"
            "movq $tmp1,$val\n\t"
            "vpinsrq  $tmp,$tmp,$tmp1\n\t"
            "vinsertf128  $dst,$dst,$tmp\t! Insert 4D" %}
  ins_encode %{
    uint x_idx = $idx$$constant & 1;
    uint y_idx = ($idx$$constant >> 1) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
    }
    __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
    __ movq($tmp1$$Register, $val$$XMMRegister);
    __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$Register, x_idx);
    __ vinsertf128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvinsert8D(vecZ dst, vecZ src, vecZ tmp, vecY tmp2, regD val, rRegL tmp1, immU3 idx) %{
  predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (VectorInsert (Binary src val) idx));
  effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2);
  format %{ "evmovdquq  $dst,$src\n\t"
            "vextractf64x4  $tmp2,$src\n\t"
            "vextractf128  $tmp,$tmp2\n\t"
            "movq $tmp1,$val\n\t"
            "vpinsrq  $tmp,$tmp,$tmp1\n\t"
            "vinsertf128  $tmp2,$tmp2,$tmp\n\t"
            "vinsertf64x4  $dst,$dst,$tmp2\t! Insert 8D" %}
  ins_encode %{
    uint x_idx = $idx$$constant & 1;
    uint y_idx = ($idx$$constant >> 1) & 1;
    uint z_idx = ($idx$$constant >> 2) & 1;

    if ($dst$$XMMRegister != $src$$XMMRegister) {
      __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
    }
    __ vextractf64x4($tmp2$$XMMRegister, $src$$XMMRegister, z_idx);
    __ vextractf128($tmp$$XMMRegister, $tmp2$$XMMRegister, y_idx);
    __ movq($tmp1$$Register, $val$$XMMRegister);
    __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$Register, x_idx);
    __ vinsertf128($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, y_idx);
    __ vinsertf64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, z_idx);
  %}
  ins_pipe( pipe_slow );
%}

// ====================REDUCTION ARITHMETIC=======================================

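// The reductions below repeatedly fold the upper half of the vector into the
// lower half (vextract* + vpadd*, or pshufd + padd*/phaddw within 128 bits),
// then accumulate the surviving lanes and the scalar input src1 in a GPR and
// re-sign-extend the result to the element width (movsbl/movswl).
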
instruct rsadd8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{
            "pshufd  $tmp,$src2,0x1\n\t"
            "paddb   $tmp,$src2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp2,$tmp, 0x0\n\t"
            "addl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "addl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x2\n\t"
            "addl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "addl    $dst,$tmp2\n\t"
            "movsbl  $dst,$dst\t! add reduction8B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ paddb($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ addl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ addl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
    __ addl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ addl($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsadd16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "paddb   $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "paddb   $tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "addl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x1\n\t"
            "addl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "addl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x3\n\t"
            "addl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! add reduction16B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ paddb($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ paddb($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ addl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ addl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ addl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
    __ addl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpaddb  $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpaddb  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddb  $tmp,$tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "addl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x1\n\t"
            "addl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "addl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x3\n\t"
            "addl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! add reduction32B" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ addl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ addl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ addl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
    __ addl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpaddb  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpaddb  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpaddb  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddb  $tmp,$tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "addl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "addl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "addl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "addl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! add reduction64B" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpaddb($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzbl($dst$$Register, $src1$$Register);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ addl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ addl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ addl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ addl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}
6357 
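// Short (16-bit) add reductions.  phaddw/vphaddw sums adjacent word pairs,
// so log2(#lanes) horizontal adds collapse the vector to one word, which is
// then added to the scalar input and sign-extended back to int.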
instruct rsadd4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{
            "movdqu   $tmp,$src2\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction4S" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
    __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ addw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "vphaddw  $tmp,$src2,$src2\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ addw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsadd8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{
            "movdqu   $tmp,$src2\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction8S" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
    __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ addw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "vphaddw  $tmp,$src2,$src2\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ addw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

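// For 16 shorts (vecY), the 256-bit vphaddw adds pairs only within each
// 128-bit lane, so a vpermq with shuffle 0xD8 after the first pass gathers
// the distinct pair sums into the low lane before the remaining passes.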
instruct rvadd16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "vphaddw  $tmp,$src2,$src2\n\t"
            "vpermq   $tmp,$tmp,0xD8\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpermq($tmp$$XMMRegister, $tmp$$XMMRegister, 0xD8, vector_len);
    __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ addw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

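// 32-short (vecZ) add reduction.  There is no 512-bit phaddw, so the vector
// is halved with vextracti64x4_high/vextracti128_high plus vpaddw, shuffled
// down to one dword, and the last two shorts are split out of a GPR with a
// 16-bit shift.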
instruct rvadd32S_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpaddw  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpaddw  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpaddw  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddw  $tmp,$tmp,$tmp2\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "movzwl  $dst,$src1\n\t"
            "addw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
            "addw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! add reduction32S" %}
  ins_encode %{
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpaddw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ addw($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 16);
    __ addw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

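// Int add reductions.  The SSE/AVX1 forms use phaddd pair sums; the
// UseAVX > 2 (EVEX) forms use pshufd/vpaddd shuffle-add trees instead,
// since the horizontal adds do not extend to the 512-bit registers.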
instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "movdqu  $tmp2,$src2\n\t"
            "phaddd  $tmp2,$tmp2\n\t"
            "movd    $tmp,$src1\n\t"
            "paddd   $tmp,$tmp2\n\t"
            "movd    $dst,$tmp\t! add reduction2I" %}
  ins_encode %{
    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($dst$$Register, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
            "movd     $dst,$tmp2\t! add reduction2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
            "vpaddd  $tmp,$src2,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction2I" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu  $tmp,$src2\n\t"
            "phaddd  $tmp,$tmp\n\t"
            "phaddd  $tmp,$tmp\n\t"
            "movd    $tmp2,$src1\n\t"
            "paddd   $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
            "vphaddd  $tmp,$tmp,$tmp\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
            "movd     $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "vpaddd  $tmp,$src2,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
            "vphaddd  $tmp,$tmp,$tmp2\n\t"
            "vextracti128_high  $tmp2,$tmp\n\t"
            "vpaddd   $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
            "movd     $dst,$tmp2\t! add reduction8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpaddd  $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction8I" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpaddd  $tmp3,$tmp3,$src2\n\t"
            "vextracti128_high  $tmp,$tmp3\n\t"
            "vpaddd  $tmp,$tmp,$tmp3\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpaddd  $tmp,$tmp,$tmp2\n\t"
            "movd    $tmp2,$src1\n\t"
            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction16I" %}
  ins_encode %{
    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
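// Long add reductions (LP64 only: the result needs a 64-bit GPR).  No
// horizontal add exists for 64-bit lanes, so these are plain
// shuffle-and-paddq trees ending in a movdq to the destination.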
instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
  match(Set dst (AddReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "paddq   $tmp2,$src2\n\t"
            "movdq   $tmp,$src1\n\t"
            "paddq   $tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! add reduction2L" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ paddq($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ paddq($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 1);
  match(Set dst (AddReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpaddq  $tmp2,$tmp,$src2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $tmp,$src1\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! add reduction4L" %}
  ins_encode %{
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVL src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpaddq  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $tmp,$src1\n\t"
            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! add reduction8L" %}
  ins_encode %{
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif

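// Float add reductions.  Unlike the integer reductions above, these are
// strictly ordered: each lane is shuffled into position 0 and added to the
// running scalar with addss/vaddss, because floating-point addition is not
// reassociative and the lane order is part of the result.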
instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "addss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "addss   $dst,$tmp\t! add reduction2F" %}
  ins_encode %{
    __ addss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "addss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "addss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "addss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "addss   $dst,$tmp\t! add reduction4F" %}
  ins_encode %{
    __ addss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVF dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vaddss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vaddss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
  ins_encode %{
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

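// Double add reductions follow the same ordered pattern: pshufd 0xE moves
// the odd 64-bit lane down, and vextractf* pulls each successive 128-bit
// chunk of the wider vectors.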
instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "addsd   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "addsd   $dst,$tmp\t! add reduction2D" %}
  ins_encode %{
    __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vaddsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf128  $tmp2,$src2,0x1\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
  predicate(UseAVX > 2);
  match(Set dst (AddReductionVD dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vaddsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vaddsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
  ins_encode %{
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

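// Ordered float/double subtract reductions (SubReductionVFP).  These mirror
// the add reductions above with subss/subsd (or their VEX forms) and keep
// the same strict lane-by-lane ordering.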
instruct rssub2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "subss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "subss   $dst,$tmp\t! sub reduction2F" %}
  ins_encode %{
    __ subss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvsub2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "vsubss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\t! sub reduction2F" %}
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rssub4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP dst, TEMP tmp);
  format %{ "subss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "subss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "subss   $dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "subss   $dst,$tmp\t! sub reduction4F" %}
  ins_encode %{
    __ subss($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvsub4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vsubss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\t! sub reduction4F" %}
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsub8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vsubss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "vextractf128_high  $tmp2,$src2\n\t"
            "vsubss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\t! sub reduction8F" %}
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsub16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vsubss  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$src2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vsubss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vsubss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vsubss  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0x01\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x02\n\t"
            "vsubss  $dst,$dst,$tmp\n\t"
            "pshufd  $tmp,$tmp2,0x03\n\t"
            "vsubss  $dst,$dst,$tmp\t! sub reduction16F" %}
  ins_encode %{
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
    __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

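// Double subtract reductions, mirroring the ordered double add reductions.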
instruct rssub2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "subsd   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "subsd   $dst,$tmp\t! sub reduction2D" %}
  ins_encode %{
    __ subsd($dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ subsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvsub2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "vsubsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\t! sub reduction2D" %}
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvsub4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vsubsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\n\t"
            "vextractf128  $tmp2,$src2,0x1\n\t"
            "vsubsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\t! sub reduction4D" %}
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvsub8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (SubReductionVFP dst src2));
  effect(TEMP tmp, TEMP dst, TEMP tmp2);
  format %{ "vsubsd  $dst,$dst,$src2\n\t"
            "pshufd  $tmp,$src2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x1\n\t"
            "vsubsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x2\n\t"
            "vsubsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\n\t"
            "vextractf32x4  $tmp2,$src2,0x3\n\t"
            "vsubsd  $dst,$dst,$tmp2\n\t"
            "pshufd  $tmp,$tmp2,0xE\n\t"
            "vsubsd  $dst,$dst,$tmp\t! sub reduction8D" %}
  ins_encode %{
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

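// Byte multiply reductions.  x86 has no byte multiply, so the bytes are
// sign-extended to shorts with (v)pmovsxbw and reduced with a pmullw
// shuffle-multiply tree; the last two shorts are extracted with pextrw and
// multiplied in GPRs together with the scalar input.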
instruct rsmul8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "pmovsxbw $tmp,$src2\n\t"
            "pshufd   $tmp1,$tmp,0xE\n\t"
            "pmullw   $tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0x1\n\t"
            "pmullw   $tmp,$tmp1\n\t"
            "pextrw   $tmp2,$tmp, 0x1\n\t"
            "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
            "movsbl   $dst,$src1\n\t"
            "imull    $dst,$tmp2\n\t"
            "movsbl   $dst,$dst\t! mul reduction8B" %}
  ins_encode %{
    __ pmovsxbw($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ imull($tmp2$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $src1$$Register);
    __ imull($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsmul16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "pmovsxbw $tmp,$src2\n\t"
            "pshufd   $tmp1,$src2,0xEE\n\t"
            "pmovsxbw $tmp1,$tmp1\n\t"
            "pmullw   $tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0xE\n\t"
            "pmullw   $tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0x1\n\t"
            "pmullw   $tmp,$tmp1\n\t"
            "pextrw   $tmp2,$tmp, 0x1\n\t"
            "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
            "movsbl   $dst,$src1\n\t"
            "imull    $dst,$tmp2\n\t"
            "movsbl   $dst,$dst\t! mul reduction16B" %}
  ins_encode %{
    __ pmovsxbw($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp1$$XMMRegister, $src2$$XMMRegister, 0xEE);
    __ pmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister);
    __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ imull($tmp2$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $src1$$Register);
    __ imull($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpmovsxbw $tmp,$tmp\n\t"
            "vpmovsxbw $tmp1,$src2\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "vextracti128_high  $tmp1,$tmp\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0xE\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0x1\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pextrw   $tmp2,$tmp, 0x1\n\t"
            "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
            "movsbl   $dst,$src1\n\t"
            "imull    $dst,$tmp2\n\t"
            "movsbl   $dst,$dst\t! mul reduction32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpmovsxbw($tmp1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
    __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ imull($tmp2$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $src1$$Register);
    __ imull($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp,$src2\n\t"
            "vpmovsxbw $tmp,$tmp\n\t"
            "vpmovsxbw $tmp1,$src2\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "vextracti64x4_high  $tmp1,$tmp\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "vextracti128_high  $tmp1,$tmp\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0xE\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pshufd   $tmp1,$tmp,0x1\n\t"
            "vpmullw  $tmp,$tmp,$tmp1\n\t"
            "pextrw   $tmp2,$tmp, 0x1\n\t"
            "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
            "movsbl   $dst,$src1\n\t"
            "imull    $dst,$tmp2\n\t"
            "movsbl   $dst,$dst\t! mul reduction64B" %}
  ins_encode %{
    int vector_len = 2;
    __ vextracti64x4_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpmovsxbw($tmp1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
    __ vextracti64x4_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 1);
    __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
    __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ imull($tmp2$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $src1$$Register);
    __ imull($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

7505 instruct rsmul4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
7506   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7507   match(Set dst (MulReductionVI src1 src2));
7508   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3);
7509   format %{ "pshufd  $tmp,$src2,0x1\n\t"
7510             "pmullw  $tmp,$src2\n\t"
7511             "pextrw  $tmp2,$tmp, 0x1\n\t"
7512             "pextrw  $tmp3,$tmp, 0x0\n\t"
7513             "imull    $tmp2,$tmp3 \n\t"
7514             "movswl   $dst,$src1\n\t"
7515             "imull    $dst,$tmp2\n\t"
7516             "movswl   $dst,$dst\t! mul reduction4S" %}
7517   ins_encode %{
7518     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
7519     __ pmullw($tmp$$XMMRegister, $src2$$XMMRegister);
7520     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7521     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7522     __ imull($tmp2$$Register, $tmp3$$Register);
7523     __ movswl($dst$$Register, $src1$$Register);
7524     __ imull($dst$$Register, $tmp2$$Register);
7525     __ movswl($dst$$Register, $dst$$Register);
7526   %}
7527   ins_pipe( pipe_slow );
7528 %}
7529 
7530 instruct rsmul8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7531   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7532   match(Set dst (MulReductionVI src1 src2));
7533   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7534   format %{ "pshufd  $tmp,$src2,0xE\n\t"
7535             "pmullw  $tmp,$src2\n\t"
7536             "pshufd  $tmp1,$tmp,0x1\n\t"
7537             "pmullw  $tmp,$tmp1\n\t"
7538             "pextrw  $tmp2,$tmp, 0x1\n\t"
7539             "pextrw  $tmp3,$tmp, 0x0\n\t"
7540             "imull    $tmp2,$tmp3 \n\t"
7541             "movswl   $dst,$src1\n\t"
7542             "imull    $dst,$tmp2\n\t"
7543             "movswl   $dst,$dst\t! mul reduction8S" %}
7544   ins_encode %{
7545     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7546     __ pmullw($tmp$$XMMRegister, $src2$$XMMRegister);
7547     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7548     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7549     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7550     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7551     __ imull($tmp2$$Register, $tmp3$$Register);
7552     __ movswl($dst$$Register, $src1$$Register);
7553     __ imull($dst$$Register, $tmp2$$Register);
7554     __ movswl($dst$$Register, $dst$$Register);
7555   %}
7556   ins_pipe( pipe_slow );
7557 %}
7558 
7559 instruct rvmul16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7560   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7561   match(Set dst (MulReductionVI src1 src2));
7562   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7563   format %{ "vextracti128_high  $tmp,$src2\n\t"
7564             "vpmullw  $tmp,$tmp,$src2\n\t"
7565             "pshufd  $tmp1,$tmp,0xE\n\t"
7566             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7567             "pshufd  $tmp1,$tmp,0x1\n\t"
7568             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7569             "pextrw  $tmp2,$tmp, 0x1\n\t"
7570             "pextrw  $tmp3,$tmp, 0x0\n\t"
7571             "imull    $tmp2,$tmp3 \n\t"
7572             "movswl   $dst,$src1\n\t"
7573             "imull    $dst,$tmp2\n\t"
7574             "movswl   $dst,$dst\t! mul reduction16S" %}
7575   ins_encode %{
7576     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7577     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 1);
7578     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7579     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7580     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7581     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7582     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7583     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7584     __ imull($tmp2$$Register, $tmp3$$Register);
7585     __ movswl($dst$$Register, $src1$$Register);
7586     __ imull($dst$$Register, $tmp2$$Register);
7587     __ movswl($dst$$Register, $dst$$Register);
7588   %}
7589   ins_pipe( pipe_slow );
7590 %}
7591 
7592 instruct rvmul32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7593   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7594   match(Set dst (MulReductionVI src1 src2));
7595   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7596   format %{ "vextracti64x4_high  $tmp1,$src2\n\t"
7597             "vpmullw  $tmp1,$tmp1,$src2\n\t"
7598             "vextracti128_high  $tmp,$tmp1\n\t"
7599             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7600             "pshufd  $tmp1,$tmp,0xE\n\t"
7601             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7602             "pshufd  $tmp1,$tmp,0x1\n\t"
7603             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7604             "pextrw  $tmp2,$tmp, 0x1\n\t"
7605             "pextrw  $tmp3,$tmp, 0x0\n\t"
7606             "imull    $tmp2,$tmp3 \n\t"
7607             "movswl   $dst,$src1\n\t"
7608             "imull    $dst,$tmp2\n\t"
7609             "movswl   $dst,$dst\t! mul reduction32S" %}
7610   ins_encode %{
7611     int vector_len = 0;
7612     __ vextracti64x4_high($tmp1$$XMMRegister, $src2$$XMMRegister);
7613     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $src2$$XMMRegister, 1);
7614     __ vextracti128_high($tmp$$XMMRegister, $tmp1$$XMMRegister);
7615     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7616     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7617     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7618     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7619     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7620     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7621     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7622     __ imull($tmp2$$Register, $tmp3$$Register);
7623     __ movswl($dst$$Register, $src1$$Register);
7624     __ imull($dst$$Register, $tmp2$$Register);
7625     __ movswl($dst$$Register, $dst$$Register);
7626   %}
7627   ins_pipe( pipe_slow );
7628 %}
7629 
7630 
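     // Int multiply reductions use pmulld, an SSE4.1 instruction, hence the
     // UseSSE > 3 predicate on the non-AVX forms below.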
7631 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
7632   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7633   match(Set dst (MulReductionVI src1 src2));
7634   effect(TEMP tmp, TEMP tmp2);
7635   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
7636             "pmulld  $tmp2,$src2\n\t"
7637             "movd    $tmp,$src1\n\t"
7638             "pmulld  $tmp2,$tmp\n\t"
7639             "movd    $dst,$tmp2\t! mul reduction2I" %}
7640   ins_encode %{
7641     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7642     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
7643     __ movdl($tmp$$XMMRegister, $src1$$Register);
7644     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7645     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7646   %}
7647   ins_pipe( pipe_slow );
7648 %}
7649 
7650 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
7651   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7652   match(Set dst (MulReductionVI src1 src2));
7653   effect(TEMP tmp, TEMP tmp2);
7654   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
7655             "vpmulld  $tmp,$src2,$tmp2\n\t"
7656             "movd     $tmp2,$src1\n\t"
7657             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7658             "movd     $dst,$tmp2\t! mul reduction2I" %}
7659   ins_encode %{
7660     int vector_len = 0;
7661     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7662     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7663     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7664     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7665     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7666   %}
7667   ins_pipe( pipe_slow );
7668 %}
7669 
7670 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
7671   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7672   match(Set dst (MulReductionVI src1 src2));
7673   effect(TEMP tmp, TEMP tmp2);
7674   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
7675             "pmulld  $tmp2,$src2\n\t"
7676             "pshufd  $tmp,$tmp2,0x1\n\t"
7677             "pmulld  $tmp2,$tmp\n\t"
7678             "movd    $tmp,$src1\n\t"
7679             "pmulld  $tmp2,$tmp\n\t"
7680             "movd    $dst,$tmp2\t! mul reduction4I" %}
7681   ins_encode %{
7682     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7683     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
7684     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
7685     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7686     __ movdl($tmp$$XMMRegister, $src1$$Register);
7687     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7688     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7689   %}
7690   ins_pipe( pipe_slow );
7691 %}
7692 
7693 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
7694   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7695   match(Set dst (MulReductionVI src1 src2));
7696   effect(TEMP tmp, TEMP tmp2);
7697   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
7698             "vpmulld  $tmp,$src2,$tmp2\n\t"
7699             "pshufd   $tmp2,$tmp,0x1\n\t"
7700             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7701             "movd     $tmp2,$src1\n\t"
7702             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7703             "movd     $dst,$tmp2\t! mul reduction4I" %}
7704   ins_encode %{
7705     int vector_len = 0;
7706     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7707     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7708     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7709     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7710     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7711     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7712     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7713   %}
7714   ins_pipe( pipe_slow );
7715 %}
7716 
7717 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
7718   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7719   match(Set dst (MulReductionVI src1 src2));
7720   effect(TEMP tmp, TEMP tmp2);
7721   format %{ "vextracti128_high  $tmp,$src2\n\t"
7722             "vpmulld  $tmp,$tmp,$src2\n\t"
7723             "pshufd   $tmp2,$tmp,0xE\n\t"
7724             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7725             "pshufd   $tmp2,$tmp,0x1\n\t"
7726             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7727             "movd     $tmp2,$src1\n\t"
7728             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7729             "movd     $dst,$tmp2\t! mul reduction8I" %}
7730   ins_encode %{
7731     int vector_len = 0;
7732     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7733     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
7734     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
7735     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7736     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7737     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7738     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7739     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7740     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7741   %}
7742   ins_pipe( pipe_slow );
7743 %}
7744 
7745 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
7746   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7747   match(Set dst (MulReductionVI src1 src2));
7748   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
7749   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
7750             "vpmulld  $tmp3,$tmp3,$src2\n\t"
7751             "vextracti128_high  $tmp,$tmp3\n\t"
7752             "vpmulld  $tmp,$tmp,$tmp3\n\t"
7753             "pshufd   $tmp2,$tmp,0xE\n\t"
7754             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7755             "pshufd   $tmp2,$tmp,0x1\n\t"
7756             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7757             "movd     $tmp2,$src1\n\t"
7758             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7759             "movd     $dst,$tmp2\t! mul reduction16I" %}
7760   ins_encode %{
7761     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
7762     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
7763     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
7764     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
7765     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
7766     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7767     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7768     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7769     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7770     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7771     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7772   %}
7773   ins_pipe( pipe_slow );
7774 %}
7775 
7776 #ifdef _LP64
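     // Long multiply reductions are 64-bit only and rely on vpmullq, an
     // AVX-512DQ instruction; see the supports_avx512dq() predicates.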
7777 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
7778   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7779   match(Set dst (MulReductionVL src1 src2));
7780   effect(TEMP tmp, TEMP tmp2);
7781   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
7782             "vpmullq  $tmp,$src2,$tmp2\n\t"
7783             "movdq    $tmp2,$src1\n\t"
7784             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
7785             "movdq    $dst,$tmp2\t! mul reduction2L" %}
7786   ins_encode %{
7787     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7788     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
7789     __ movdq($tmp2$$XMMRegister, $src1$$Register);
7790     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7791     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7792   %}
7793   ins_pipe( pipe_slow );
7794 %}
7795 
7796 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
7797   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7798   match(Set dst (MulReductionVL src1 src2));
7799   effect(TEMP tmp, TEMP tmp2);
7800   format %{ "vextracti128_high  $tmp,$src2\n\t"
7801             "vpmullq  $tmp2,$tmp,$src2\n\t"
7802             "pshufd   $tmp,$tmp2,0xE\n\t"
7803             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7804             "movdq    $tmp,$src1\n\t"
7805             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7806             "movdq    $dst,$tmp2\t! mul reduction4L" %}
7807   ins_encode %{
7808     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7809     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
7810     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7811     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7812     __ movdq($tmp$$XMMRegister, $src1$$Register);
7813     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7814     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7815   %}
7816   ins_pipe( pipe_slow );
7817 %}
7818 
7819 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
7820   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7821   match(Set dst (MulReductionVL src1 src2));
7822   effect(TEMP tmp, TEMP tmp2);
7823   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
7824             "vpmullq  $tmp2,$tmp2,$src2\n\t"
7825             "vextracti128_high  $tmp,$tmp2\n\t"
7826             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7827             "pshufd   $tmp,$tmp2,0xE\n\t"
7828             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7829             "movdq    $tmp,$src1\n\t"
7830             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7831             "movdq    $dst,$tmp2\t! mul reduction8L" %}
7832   ins_encode %{
7833     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7834     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
7835     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
7836     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7837     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7838     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7839     __ movdq($tmp$$XMMRegister, $src1$$Register);
7840     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7841     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7842   %}
7843   ins_pipe( pipe_slow );
7844 %}
7845 #endif
7846 
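     // Float and double multiply reductions work lane by lane with scalar
     // mulss/mulsd, bringing each lane down with pshufd/vextract*; the fixed
     // evaluation order matters because FP multiplication is not associative.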
7847 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
7848   predicate(UseSSE >= 1 && UseAVX == 0);
7849   match(Set dst (MulReductionVF dst src2));
7850   effect(TEMP dst, TEMP tmp);
7851   format %{ "mulss   $dst,$src2\n\t"
7852             "pshufd  $tmp,$src2,0x01\n\t"
7853             "mulss   $dst,$tmp\t! mul reduction2F" %}
7854   ins_encode %{
7855     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
7856     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7857     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7858   %}
7859   ins_pipe( pipe_slow );
7860 %}
7861 
7862 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
7863   predicate(UseAVX > 0);
7864   match(Set dst (MulReductionVF dst src2));
7865   effect(TEMP tmp, TEMP dst);
7866   format %{ "vmulss  $dst,$dst,$src2\n\t"
7867             "pshufd  $tmp,$src2,0x01\n\t"
7868             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
7869   ins_encode %{
7870     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7871     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7872     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7873   %}
7874   ins_pipe( pipe_slow );
7875 %}
7876 
7877 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
7878   predicate(UseSSE >= 1 && UseAVX == 0);
7879   match(Set dst (MulReductionVF dst src2));
7880   effect(TEMP dst, TEMP tmp);
7881   format %{ "mulss   $dst,$src2\n\t"
7882             "pshufd  $tmp,$src2,0x01\n\t"
7883             "mulss   $dst,$tmp\n\t"
7884             "pshufd  $tmp,$src2,0x02\n\t"
7885             "mulss   $dst,$tmp\n\t"
7886             "pshufd  $tmp,$src2,0x03\n\t"
7887             "mulss   $dst,$tmp\t! mul reduction4F" %}
7888   ins_encode %{
7889     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
7890     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7891     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7892     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7893     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7894     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7895     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7896   %}
7897   ins_pipe( pipe_slow );
7898 %}
7899 
7900 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
7901   predicate(UseAVX > 0);
7902   match(Set dst (MulReductionVF dst src2));
7903   effect(TEMP tmp, TEMP dst);
7904   format %{ "vmulss  $dst,$dst,$src2\n\t"
7905             "pshufd  $tmp,$src2,0x01\n\t"
7906             "vmulss  $dst,$dst,$tmp\n\t"
7907             "pshufd  $tmp,$src2,0x02\n\t"
7908             "vmulss  $dst,$dst,$tmp\n\t"
7909             "pshufd  $tmp,$src2,0x03\n\t"
7910             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
7911   ins_encode %{
7912     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7913     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7914     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7915     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7916     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7917     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7918     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7919   %}
7920   ins_pipe( pipe_slow );
7921 %}
7922 
7923 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
7924   predicate(UseAVX > 0);
7925   match(Set dst (MulReductionVF dst src2));
7926   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7927   format %{ "vmulss  $dst,$dst,$src2\n\t"
7928             "pshufd  $tmp,$src2,0x01\n\t"
7929             "vmulss  $dst,$dst,$tmp\n\t"
7930             "pshufd  $tmp,$src2,0x02\n\t"
7931             "vmulss  $dst,$dst,$tmp\n\t"
7932             "pshufd  $tmp,$src2,0x03\n\t"
7933             "vmulss  $dst,$dst,$tmp\n\t"
7934             "vextractf128_high  $tmp2,$src2\n\t"
7935             "vmulss  $dst,$dst,$tmp2\n\t"
7936             "pshufd  $tmp,$tmp2,0x01\n\t"
7937             "vmulss  $dst,$dst,$tmp\n\t"
7938             "pshufd  $tmp,$tmp2,0x02\n\t"
7939             "vmulss  $dst,$dst,$tmp\n\t"
7940             "pshufd  $tmp,$tmp2,0x03\n\t"
7941             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
7942   ins_encode %{
7943     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7944     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7945     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7946     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7947     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7948     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7949     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7950     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7951     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7952     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7953     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7954     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7955     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7956     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7957     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7958   %}
7959   ins_pipe( pipe_slow );
7960 %}
7961 
7962 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
7963   predicate(UseAVX > 2);
7964   match(Set dst (MulReductionVF dst src2));
7965   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7966   format %{ "vmulss  $dst,$dst,$src2\n\t"
7967             "pshufd  $tmp,$src2,0x01\n\t"
7968             "vmulss  $dst,$dst,$tmp\n\t"
7969             "pshufd  $tmp,$src2,0x02\n\t"
7970             "vmulss  $dst,$dst,$tmp\n\t"
7971             "pshufd  $tmp,$src2,0x03\n\t"
7972             "vmulss  $dst,$dst,$tmp\n\t"
7973             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7974             "vmulss  $dst,$dst,$tmp2\n\t"
7975             "pshufd  $tmp,$tmp2,0x01\n\t"
7976             "vmulss  $dst,$dst,$tmp\n\t"
7977             "pshufd  $tmp,$tmp2,0x02\n\t"
7978             "vmulss  $dst,$dst,$tmp\n\t"
7979             "pshufd  $tmp,$tmp2,0x03\n\t"
7980             "vmulss  $dst,$dst,$tmp\n\t"
7981             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7982             "vmulss  $dst,$dst,$tmp2\n\t"
7983             "pshufd  $tmp,$tmp2,0x01\n\t"
7984             "vmulss  $dst,$dst,$tmp\n\t"
7985             "pshufd  $tmp,$tmp2,0x02\n\t"
7986             "vmulss  $dst,$dst,$tmp\n\t"
7987             "pshufd  $tmp,$tmp2,0x03\n\t"
7988             "vmulss  $dst,$dst,$tmp\n\t"
7989             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7990             "vmulss  $dst,$dst,$tmp2\n\t"
7991             "pshufd  $tmp,$tmp2,0x01\n\t"
7992             "vmulss  $dst,$dst,$tmp\n\t"
7993             "pshufd  $tmp,$tmp2,0x02\n\t"
7994             "vmulss  $dst,$dst,$tmp\n\t"
7995             "pshufd  $tmp,$tmp2,0x03\n\t"
7996             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
7997   ins_encode %{
7998     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7999     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
8000     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8001     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
8002     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8003     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
8004     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8005     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
8006     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8007     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8008     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8009     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8010     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8011     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8012     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8013     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
8014     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8015     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8016     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8017     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8018     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8019     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8020     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8021     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
8022     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8023     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8024     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8025     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8026     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8027     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8028     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8029   %}
8030   ins_pipe( pipe_slow );
8031 %}
8032 
8033 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
8034   predicate(UseSSE >= 1 && UseAVX == 0);
8035   match(Set dst (MulReductionVD dst src2));
8036   effect(TEMP dst, TEMP tmp);
8037   format %{ "mulsd   $dst,$src2\n\t"
8038             "pshufd  $tmp,$src2,0xE\n\t"
8039             "mulsd   $dst,$tmp\t! mul reduction2D" %}
8040   ins_encode %{
8041     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
8042     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8043     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
8044   %}
8045   ins_pipe( pipe_slow );
8046 %}
8047 
8048 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
8049   predicate(UseAVX > 0);
8050   match(Set dst (MulReductionVD dst src2));
8051   effect(TEMP tmp, TEMP dst);
8052   format %{ "vmulsd  $dst,$dst,$src2\n\t"
8053             "pshufd  $tmp,$src2,0xE\n\t"
8054             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
8055   ins_encode %{
8056     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
8057     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8058     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8059   %}
8060   ins_pipe( pipe_slow );
8061 %}
8062 
8063 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
8064   predicate(UseAVX > 0);
8065   match(Set dst (MulReductionVD dst src2));
8066   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8067   format %{ "vmulsd  $dst,$dst,$src2\n\t"
8068             "pshufd  $tmp,$src2,0xE\n\t"
8069             "vmulsd  $dst,$dst,$tmp\n\t"
8070             "vextractf128_high  $tmp2,$src2\n\t"
8071             "vmulsd  $dst,$dst,$tmp2\n\t"
8072             "pshufd  $tmp,$tmp2,0xE\n\t"
8073             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
8074   ins_encode %{
8075     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
8076     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8077     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8078     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8079     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8080     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8081     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8082   %}
8083   ins_pipe( pipe_slow );
8084 %}
8085 
8086 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
8087   predicate(UseAVX > 2);
8088   match(Set dst (MulReductionVD dst src2));
8089   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8090   format %{ "vmulsd  $dst,$dst,$src2\n\t"
8091             "pshufd  $tmp,$src2,0xE\n\t"
8092             "vmulsd  $dst,$dst,$tmp\n\t"
8093             "vextractf32x4  $tmp2,$src2,0x1\n\t"
8094             "vmulsd  $dst,$dst,$tmp2\n\t"
8095             "pshufd  $tmp,$tmp2,0xE\n\t"
8096             "vmulsd  $dst,$dst,$tmp\n\t"
8097             "vextractf32x4  $tmp2,$src2,0x2\n\t"
8098             "vmulsd  $dst,$dst,$tmp2\n\t"
8099             "pshufd  $tmp,$tmp2,0xE\n\t"
8100             "vmulsd  $dst,$dst,$tmp\n\t"
8101             "vextractf32x4  $tmp2,$src2,0x3\n\t"
8102             "vmulsd  $dst,$dst,$tmp2\n\t"
8103             "pshufd  $tmp,$tmp2,0xE\n\t"
8104             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
8105   ins_encode %{
8106     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
8107     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8108     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8109     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
8110     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8111     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8112     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8113     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
8114     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8115     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8116     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8117     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
8118     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8119     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8120     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8121   %}
8122   ins_pipe( pipe_slow );
8123 %}
8124 
8125 //--------------------Min Reduction --------------------
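     // Byte and short min reductions fold the vector pairwise with
     // pminsb/pminsw (or their VEX forms), then finish in scalar registers:
     // the surviving lanes are sign-extended (movsbl/movswl), the smaller
     // value is selected with cmpl/cmovl, and the scalar input $src1 is
     // folded in the same way.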
8126 instruct rsmin8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8127   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8128   match(Set dst (MinReductionV src1 src2));
8129   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8130   format %{ "pshufd  $tmp,$src2,0x1\n\t"
8131             "pminsb  $tmp,$src2\n\t"
8132             "pextrb  $tmp2,$tmp, 0x1\n\t"
8133             "movsbl  $tmp2,$tmp2\n\t"
8134             "pextrb  $tmp3,$tmp,0x0\n\t"
8135             "movsbl  $tmp3,$tmp3\n\t"
8136             "cmpl  $tmp2,$tmp3\n\t"
8137             "cmovl  $tmp3,$tmp2\n\t"
8138             "cmpl  $src1,$tmp3\n\t"
8139             "cmovl  $tmp3,$src1\n\t"
8140             "movl  $dst,$tmp3\n\t"
8141             "pextrb  $tmp2,$tmp, 0x3\n\t"
8142             "movsbl  $tmp2,$tmp2\n\t"
8143             "pextrb  $tmp3,$tmp, 0x2\n\t"
8144             "movsbl  $tmp3,$tmp3\n\t"
8145             "cmpl  $tmp2,$tmp3\n\t"
8146             "cmovl  $tmp3,$tmp2\n\t"
8147             "cmpl  $tmp3,$dst\n\t"
8148             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction8B" %}
8149   ins_encode %{
8150     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
8151     __ pminsb($tmp$$XMMRegister, $src2$$XMMRegister);
8152     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8153     __ movsbl($tmp2$$Register, $tmp2$$Register);
8154     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8155     __ movsbl($tmp3$$Register, $tmp3$$Register);
8156     __ cmpl($tmp2$$Register, $tmp3$$Register);
8157     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8158     __ cmpl($src1$$Register, $tmp3$$Register);
8159     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8160     __ movl($dst$$Register, $tmp3$$Register);
8161     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8162     __ movsbl($tmp2$$Register, $tmp2$$Register);
8163     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8164     __ movsbl($tmp3$$Register, $tmp3$$Register);
8165     __ cmpl($tmp2$$Register, $tmp3$$Register);
8166     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8167     __ cmpl($tmp3$$Register, $dst$$Register);
8168     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8169     __ movsbl($dst$$Register, $dst$$Register);
8170   %}
8171   ins_pipe( pipe_slow );
8172 %}
8173 
8174 instruct rsmin16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8175   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8176   match(Set dst (MinReductionV src1 src2));
8177   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8178   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
8179             "pminsb  $tmp4,$src2\n\t"
8180             "pshufd  $tmp,$tmp4,0x1\n\t"
8181             "pminsb  $tmp,$tmp4\n\t"
8182             "pextrb  $tmp2,$tmp, 0x1\n\t"
8183             "movsbl  $tmp2,$tmp2\n\t"
8184             "pextrb  $tmp3,$tmp,0x0\n\t"
8185             "movsbl  $tmp3,$tmp3\n\t"
8186             "cmpl  $tmp2,$tmp3\n\t"
8187             "cmovl  $tmp3,$tmp2\n\t"
8188             "cmpl  $src1,$tmp3\n\t"
8189             "cmovl  $tmp3,$src1\n\t"
8190             "movl  $dst,$tmp3\n\t"
8191             "pextrb  $tmp2,$tmp, 0x3\n\t"
8192             "movsbl  $tmp2,$tmp2\n\t"
8193             "pextrb  $tmp3,$tmp, 0x2\n\t"
8194             "movsbl  $tmp3,$tmp3\n\t"
8195             "cmpl  $tmp2,$tmp3\n\t"
8196             "cmovl  $tmp3,$tmp2\n\t"
8197             "cmpl  $tmp3,$dst\n\t"
8198             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction16B" %}
8199   ins_encode %{
8200     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
8201     __ pminsb($tmp4$$XMMRegister, $src2$$XMMRegister);
8202     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8203     __ pminsb($tmp$$XMMRegister, $tmp4$$XMMRegister);
8204     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8205     __ movsbl($tmp2$$Register, $tmp2$$Register);
8206     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8207     __ movsbl($tmp3$$Register, $tmp3$$Register);
8208     __ cmpl($tmp2$$Register, $tmp3$$Register);
8209     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8210     __ cmpl($src1$$Register, $tmp3$$Register);
8211     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8212     __ movl($dst$$Register, $tmp3$$Register);
8213     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8214     __ movsbl($tmp2$$Register, $tmp2$$Register);
8215     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8216     __ movsbl($tmp3$$Register, $tmp3$$Register);
8217     __ cmpl($tmp2$$Register, $tmp3$$Register);
8218     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8219     __ cmpl($tmp3$$Register, $dst$$Register);
8220     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8221     __ movsbl($dst$$Register, $dst$$Register);
8222   %}
8223   ins_pipe( pipe_slow );
8224 %}
8225 
8226 instruct rvmin16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8227   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8228   match(Set dst (MinReductionV src1 src2));
8229   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8230   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
8231             "vpminsb  $tmp4,$tmp4,$src2\n\t"
8232             "pshufd  $tmp,$tmp4,0x1\n\t"
8233             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8234             "pextrb  $tmp2,$tmp, 0x1\n\t"
8235             "movsbl  $tmp2,$tmp2\n\t"
8236             "pextrb  $tmp3,$tmp,0x0\n\t"
8237             "movsbl  $tmp3,$tmp3\n\t"
8238             "cmpl  $tmp2,$tmp3\n\t"
8239             "cmovl  $tmp3,$tmp2\n\t"
8240             "cmpl  $src1,$tmp3\n\t"
8241             "cmovl  $tmp3,$src1\n\t"
8242             "movl  $dst,$tmp3\n\t"
8243             "pextrb  $tmp2,$tmp, 0x3\n\t"
8244             "movsbl  $tmp2,$tmp2\n\t"
8245             "pextrb  $tmp3,$tmp, 0x2\n\t"
8246             "movsbl  $tmp3,$tmp3\n\t"
8247             "cmpl  $tmp2,$tmp3\n\t"
8248             "cmovl  $tmp3,$tmp2\n\t"
8249             "cmpl  $tmp3,$dst\n\t"
8250             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction16B" %}
8251   ins_encode %{
8253     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
8254     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 0);
8255     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8256     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8257     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8258     __ movsbl($tmp2$$Register, $tmp2$$Register);
8259     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8260     __ movsbl($tmp3$$Register, $tmp3$$Register);
8261     __ cmpl($tmp2$$Register, $tmp3$$Register);
8262     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8263     __ cmpl($src1$$Register, $tmp3$$Register);
8264     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8265     __ movl($dst$$Register, $tmp3$$Register);
8266     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8267     __ movsbl($tmp2$$Register, $tmp2$$Register);
8268     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8269     __ movsbl($tmp3$$Register, $tmp3$$Register);
8270     __ cmpl($tmp2$$Register, $tmp3$$Register);
8271     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8272     __ cmpl($tmp3$$Register, $dst$$Register);
8273     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8274     __ movsbl($dst$$Register, $dst$$Register);
8275   %}
8276   ins_pipe( pipe_slow );
8277 %}
8278 
8279 instruct rvmin32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8280   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8281   match(Set dst (MinReductionV src1 src2));
8282   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8283   format %{ "vextracti128_high  $tmp,$src2\n\t"
8284             "vpminsb  $tmp,$tmp,$src2\n\t"
8285             "pshufd  $tmp4,$tmp,0xE\n\t"
8286             "vpminsb  $tmp4,$tmp4,$tmp\n\t"
8287             "pshufd  $tmp,$tmp4,0x1\n\t"
8288             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8289             "pextrb  $tmp2,$tmp, 0x1\n\t"
8290             "movsbl  $tmp2,$tmp2\n\t"
8291             "pextrb  $tmp3,$tmp,0x0\n\t"
8292             "movsbl  $tmp3,$tmp3\n\t"
8293             "cmpl  $tmp2,$tmp3\n\t"
8294             "cmovl  $tmp3,$tmp2\n\t"
8295             "cmpl  $src1,$tmp3\n\t"
8296             "cmovl  $tmp3,$src1\n\t"
8297             "movl  $dst,$tmp3\n\t"
8298             "pextrb  $tmp2,$tmp, 0x3\n\t"
8299             "movsbl  $tmp2,$tmp2\n\t"
8300             "pextrb  $tmp3,$tmp, 0x2\n\t"
8301             "movsbl  $tmp3,$tmp3\n\t"
8302             "cmpl  $tmp2,$tmp3\n\t"
8303             "cmovl  $tmp3,$tmp2\n\t"
8304             "cmpl  $tmp3,$dst\n\t"
8305             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction32B" %}
8306   ins_encode %{
8307     int vector_len = 1;
8308     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8309     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8310     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
8311     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
8312     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8313     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8314     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8315     __ movsbl($tmp2$$Register, $tmp2$$Register);
8316     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8317     __ movsbl($tmp3$$Register, $tmp3$$Register);
8318     __ cmpl($tmp2$$Register, $tmp3$$Register);
8319     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8320     __ cmpl($src1$$Register, $tmp3$$Register);
8321     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8322     __ movl($dst$$Register, $tmp3$$Register);
8323     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8324     __ movsbl($tmp2$$Register, $tmp2$$Register);
8325     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8326     __ movsbl($tmp3$$Register, $tmp3$$Register);
8327     __ cmpl($tmp2$$Register, $tmp3$$Register);
8328     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8329     __ cmpl($tmp3$$Register, $dst$$Register);
8330     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8331     __ movsbl($dst$$Register, $dst$$Register);
8332   %}
8333   ins_pipe( pipe_slow );
8334 %}
8335 
8336 instruct rvmin64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8337   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8338   match(Set dst (MinReductionV src1 src2));
8339   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8340   format %{ "vextracti64x4_high  $tmp4,$src2\n\t"
8341             "vpminsb  $tmp4,$tmp4,$src2\n\t"
8342             "vextracti128_high  $tmp,$tmp4\n\t"
8343             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8344             "pshufd  $tmp4,$tmp,0xE\n\t"
8345             "vpminsb  $tmp4,$tmp4,$tmp\n\t"
8346             "pshufd  $tmp,$tmp4,0x1\n\t"
8347             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8348             "pextrb  $tmp2,$tmp, 0x1\n\t"
8349             "movsbl  $tmp2,$tmp2\n\t"
8350             "pextrb  $tmp3,$tmp,0x0\n\t"
8351             "movsbl  $tmp3,$tmp3\n\t"
8352             "cmpl  $tmp2,$tmp3\n\t"
8353             "cmovl  $tmp3,$tmp2\n\t"
8354             "cmpl  $src1,$tmp3\n\t"
8355             "cmovl  $tmp3,$src1\n\t"
8356             "movl  $dst,$tmp3\n\t"
8357             "pextrb  $tmp2,$tmp, 0x3\n\t"
8358             "movsbl  $tmp2,$tmp2\n\t"
8359             "pextrb  $tmp3,$tmp, 0x2\n\t"
8360             "movsbl  $tmp3,$tmp3\n\t"
8361             "cmpl  $tmp2,$tmp3\n\t"
8362             "cmovl  $tmp3,$tmp2\n\t"
8363             "cmpl  $tmp3,$dst\n\t"
8364             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction64B" %}
8365   ins_encode %{
8366     __ vextracti64x4_high($tmp4$$XMMRegister, $src2$$XMMRegister);
8367     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 2);
8368     __ vextracti128_high($tmp$$XMMRegister, $tmp4$$XMMRegister);
8369     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 1);
8370     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
8371     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
8372     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8373     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8374     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8375     __ movsbl($tmp2$$Register, $tmp2$$Register);
8376     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8377     __ movsbl($tmp3$$Register, $tmp3$$Register);
8378     __ cmpl($tmp2$$Register, $tmp3$$Register);
8379     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8380     __ cmpl($src1$$Register, $tmp3$$Register);
8381     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8382     __ movl($dst$$Register, $tmp3$$Register);
8383     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8384     __ movsbl($tmp2$$Register, $tmp2$$Register);
8385     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8386     __ movsbl($tmp3$$Register, $tmp3$$Register);
8387     __ cmpl($tmp2$$Register, $tmp3$$Register);
8388     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8389     __ cmpl($tmp3$$Register, $dst$$Register);
8390     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8391     __ movsbl($dst$$Register, $dst$$Register);
8392   %}
8393   ins_pipe( pipe_slow );
8394 %}
8395 
8396 instruct rsmin4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
8397   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8398   match(Set dst (MinReductionV src1 src2));
8399   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8400   format %{ "pshufd  $tmp,$src2,0x1\n\t"
8401             "pminsw  $tmp,$src2\n\t"
8402             "pextrw  $tmp2,$tmp, 0x1\n\t"
8403             "movswl  $tmp2,$tmp2\n\t"
8404             "pextrw  $tmp3,$tmp, 0x0\n\t"
8405             "movswl  $tmp3,$tmp3\n\t"
8406             "cmpl  $tmp2,$tmp3\n\t"
8407             "cmovl  $tmp3,$tmp2\n\t"
8408             "cmpl  $src1,$tmp3\n\t"
8409             "cmovl  $tmp3,$src1\n\t"
8410             "movl  $dst,$tmp3\t! min reduction4S" %}
8411   ins_encode %{
8412     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
8413     __ pminsw($tmp$$XMMRegister, $src2$$XMMRegister);
8414     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
8415     __ movswl($tmp2$$Register, $tmp2$$Register);
8416     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8417     __ movswl($tmp3$$Register, $tmp3$$Register);
8418     __ cmpl($tmp2$$Register, $tmp3$$Register);
8419     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8420     __ cmpl($src1$$Register, $tmp3$$Register);
8421     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8422     __ movl($dst$$Register, $tmp3$$Register);
8423   %}
8424   ins_pipe( pipe_slow );
8425 %}
8426 
8427 instruct rsmin8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8428   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8429   match(Set dst (MinReductionV src1 src2));
8430   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8431   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
8432             "pminsw  $tmp2,$src2\n\t"
8433             "pshufd  $tmp,$tmp2,0x1\n\t"
8434             "pminsw  $tmp,$tmp2\n\t"
8435             "pextrw  $tmp4,$tmp, 0x1\n\t"
8436             "movswl  $tmp4,$tmp4\n\t"
8437             "pextrw  $tmp3,$tmp, 0x0\n\t"
8438             "movswl  $tmp3,$tmp3\n\t"
8439             "cmpl    $tmp4,$tmp3\n\t"
8440             "cmovl  $tmp3,$tmp4\n\t"
8441             "cmpl  $src1,$tmp3\n\t"
8442             "cmovl  $tmp3,$src1\n\t"
8443             "movl  $dst,$tmp3\t! min reduction8S" %}
8444   ins_encode %{
8445     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister,0xE);
8446     __ pminsw($tmp2$$XMMRegister, $src2$$XMMRegister);
8447     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8448     __ pminsw($tmp$$XMMRegister, $tmp2$$XMMRegister);
8449     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8450     __ movswl($tmp4$$Register, $tmp4$$Register);
8451     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8452     __ movswl($tmp3$$Register, $tmp3$$Register);
8453     __ cmpl($tmp4$$Register, $tmp3$$Register);
8454     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8455     __ cmpl($src1$$Register, $tmp3$$Register);
8456     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8457     __ movl($dst$$Register, $tmp3$$Register);
8458   %}
8459   ins_pipe( pipe_slow );
8460 %}
8461 
8462 instruct rvmin8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8463   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8464   match(Set dst (MinReductionV src1 src2));
8465   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8466   format %{ "pshufd   $tmp,$src2,0xE\n\t"
8467             "vpminsw  $tmp,$tmp,$src2\n\t"
8468             "pshufd   $tmp2,$tmp,0x1\n\t"
8469             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8470             "pextrw   $tmp4,$tmp, 0x1\n\t"
8471             "movswl   $tmp4,$tmp4\n\t"
8472             "pextrw   $tmp3,$tmp, 0x0\n\t"
8473             "movswl   $tmp3,$tmp3\n\t"
8474             "cmpl     $tmp4,$tmp3\n\t"
8475             "cmovl    $tmp3,$tmp4\n\t"
                 "cmpl     $src1,$tmp3\n\t"
                 "cmovl    $tmp3,$src1\n\t"
                 "movl     $dst,$tmp3\t! min reduction8S" %}
8476   ins_encode %{
8477     int vector_len = 0;
8478     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8479     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8480     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8481     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8482     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8483     __ movswl($tmp4$$Register, $tmp4$$Register);
8484     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8485     __ movswl($tmp3$$Register, $tmp3$$Register);
8486     __ cmpl($tmp4$$Register, $tmp3$$Register);
8487     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8488     __ cmpl($src1$$Register, $tmp3$$Register);
8489     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8490     __ movl($dst$$Register, $tmp3$$Register);
8491   %}
8492   ins_pipe( pipe_slow );
8493 %}
8494 
8495 instruct rvmin16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8496   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8497   match(Set dst (MinReductionV src1 src2));
8498   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8499   format %{ "vextracti128_high  $tmp,$src2\n\t"
8500             "vpminsw  $tmp,$tmp,$src2\n\t"
8501             "pshufd  $tmp2,$tmp,0xE\n\t"
8502             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8503             "pshufd  $tmp2,$tmp,0x1\n\t"
8504             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8505             "pextrw  $tmp4,$tmp, 0x1\n\t"
8506             "movswl  $tmp4,$tmp4\n\t"
8507             "pextrw  $tmp3,$tmp, 0x0\n\t"
8508             "movswl  $tmp3,$tmp3\n\t"
8509             "cmpl  $tmp4,$tmp3\n\t"
8510             "cmovl  $tmp3,$tmp4\n\t"
8511             "cmpl  $src1,$tmp3\n\t"
8512             "cmovl  $tmp3,$src1\n\t"
8513             "movl  $dst,$tmp3\t! min reduction16S" %}
8514   ins_encode %{
8515     int vector_len = 1;
8516     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8517     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8518     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8519     __ vpminsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8520     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8521     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8522     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8523     __ movswl($tmp4$$Register, $tmp4$$Register);
8524     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8525     __ movswl($tmp3$$Register, $tmp3$$Register);
8526     __ cmpl($tmp4$$Register, $tmp3$$Register);
8527     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8528     __ cmpl($src1$$Register, $tmp3$$Register);
8529     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8530     __ movl($dst$$Register, $tmp3$$Register);
8531   %}
8532   ins_pipe( pipe_slow );
8533 %}
8534 
8535 instruct rvmin32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8536   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8537   match(Set dst (MinReductionV src1 src2));
8538   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8539   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
8540             "vpminsw  $tmp2,$tmp2,$src2\n\t"
8541             "vextracti128_high  $tmp,$tmp2\n\t"
8542             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8543             "pshufd  $tmp2,$tmp,0xE\n\t"
8544             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8545             "pshufd  $tmp2,$tmp,0x1\n\t"
8546             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8547             "pextrw  $tmp4,$tmp, 0x1\n\t"
8548             "movswl  $tmp4,$tmp4\n\t"
8549             "pextrw  $tmp3,$tmp, 0x0\n\t"
8550             "movswl  $tmp3,$tmp3\n\t"
8551             "cmpl  $tmp4,$tmp3\n\t"
8552             "cmovl  $tmp3,$tmp4\n\t"
8553             "cmpl  $src1,$tmp3\n\t"
8554             "cmovl  $tmp3,$src1\n\t"
8555             "movl  $dst,$tmp3\t! min reduction32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpminsw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pextrw($tmp4$$Register, $tmp$$XMMRegister, 0x1);
    __ movswl($tmp4$$Register, $tmp4$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movswl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp4$$Register, $tmp3$$Register);
    __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
  %}
  ins_pipe( pipe_slow );
%}

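// Int min reductions: repeatedly halve the live lanes with pshufd and a
// packed signed min (pminsd/vpminsd), then fold the scalar input src1 in
// through the XMM side with one more packed min.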
instruct rsmin2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pminsd  $tmp,$src2\n\t"
            "movd    $tmp2,$src1\n\t"
            "pminsd  $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! min reduction2I" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pminsd($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ pminsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp,$src2,0x1\n\t"
            "vpminsd  $tmp,$tmp,$src2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction2I" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsmin4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "pminsd  $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "pminsd  $tmp2,$tmp\n\t"
            "movd    $tmp,$src1\n\t"
            "pminsd  $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! min reduction4I" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pminsd($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ pminsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pminsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp,$src2,0xE\n\t"
            "vpminsd  $tmp2,$tmp,$src2\n\t"
            "pshufd   $tmp,$tmp2,0x1\n\t"
            "vpminsd  $tmp2,$tmp2,$tmp\n\t"
            "movd     $tmp,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ vpminsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

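// AVX-512 flavour of the 4-int reduction; the instruction sequence is the
// same, it is kept as a separate pattern for the UseAVX > 2 predicate.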
instruct rvmin4I_reduction_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd   $tmp,$src2,0xE\n\t"
            "vpminsd  $tmp2,$tmp,$src2\n\t"
            "pshufd   $tmp,$tmp2,0x1\n\t"
            "vpminsd  $tmp2,$tmp2,$tmp\n\t"
            "movd     $tmp,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction4I" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ vpminsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high   $tmp,$src2\n\t"
            "vpminsd  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp2,$tmp,0xE\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "pshufd   $tmp,$tmp2,0x1\n\t"
            "vpminsd  $tmp2,$tmp2,$tmp\n\t"
            "movd     $tmp,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ vpminsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8I_reduction_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high   $tmp,$src2\n\t"
            "vpminsd  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp2,$tmp,0xE\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "pshufd   $tmp,$tmp2,0x1\n\t"
            "vpminsd  $tmp2,$tmp2,$tmp\n\t"
            "movd     $tmp,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ vpminsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin16I_reduction_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpminsd  $tmp3,$tmp3,$src2\n\t"
            "vextracti128_high   $tmp,$tmp3\n\t"
            "vpminsd  $tmp,$tmp,$tmp3\n\t"
            "pshufd   $tmp2,$tmp,0xE\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "pshufd   $tmp,$tmp2,0x1\n\t"
            "vpminsd  $tmp2,$tmp2,$tmp\n\t"
            "movd     $tmp,$src1\n\t"
            "vpminsd  $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! min reduction16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vpminsd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
    __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ vpminsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Long Min Reduction
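// There is no packed signed min for 64-bit lanes before AVX-512, so the long
// variants emulate it: pcmpgtq/vpcmpgtq builds a greater-than mask and
// blendvpd/vblendvpd then selects the smaller lane under that mask.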
instruct rsmin1L_reduction_reg(rRegL dst, rRegL src1, legVecD src2, rxmm0 tmp, legVecD tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdq     $tmp,$src1\n\t"
            "movdq     $tmp2,$src1\n\t"
            "pcmpgtq   $tmp,$src2\n\t"
            "blendvpd  $tmp2,$src2\n\t"
            "movdq     $dst,$tmp2\t! min reduction1L" %}
  ins_encode %{
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ movdq($tmp2$$XMMRegister, $src1$$Register);
    __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
    __ blendvpd($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

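// SSE4.1 blendvpd takes its mask implicitly in xmm0, which is why the mask
// temp in these non-AVX patterns is pinned to the rxmm0 operand class.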
instruct rsmin2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, rxmm0 xmm_0, vecX tmp2, vecX tmp3) %{
  predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd    $tmp3,$src2,0xE\n\t"
            "movdqu    $xmm_0,$src2\n\t"
            "movdqu    $tmp2,$src2\n\t"
            "pcmpgtq   $xmm_0,$tmp3\n\t"
            "blendvpd  $tmp2,$tmp3\n\t"
            "movdqu    $xmm_0,$tmp2\n\t"
            "movdq     $tmp3,$src1\n\t"
            "pcmpgtq   $xmm_0,$tmp3\n\t"
            "blendvpd  $tmp2,$tmp3\n\t"
            "movq      $dst,$tmp2\t! min reduction2L" %}
  ins_encode %{
    __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ movdqu($xmm_0$$XMMRegister, $src2$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
    __ blendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ movdqu($xmm_0$$XMMRegister, $tmp2$$XMMRegister);
    __ movdq($tmp3$$XMMRegister, $src1$$Register);
    __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
    __ blendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin2L_reduction_reg(rRegL dst, rRegL src1, legVecX src2, legVecX tmp, legVecX tmp2, legVecX tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd     $tmp2,$src2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$tmp2,$src2,$tmp\n\t"
            "movq       $tmp,$src1\n\t"
            "vpcmpgtq   $tmp3,$tmp2,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp2,$tmp,$tmp3\n\t"
            "movq       $dst,$tmp2\t! min reduction2L" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpcmpgtq($tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin4L_reduction_reg(rRegL dst, rRegL src1, legVecY src2, legVecY tmp, legVecY tmp2, legVecY tmp3) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti128_high   $tmp2,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$tmp2,$src2,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp3,$tmp2,$tmp\n\t"
            "movq       $tmp,$src1\n\t"
            "vpcmpgtq   $tmp2,$tmp3,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp,$tmp2\n\t"
            "movq       $dst,$tmp2\t! min reduction4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vextracti128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vblendvpd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpcmpgtq($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vblendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp3,$src2\n\t"
            "vblendvpd  $tmp3,$tmp3,$src2,$tmp\n\t"
            "vextracti128_high   $tmp2,$tmp3\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp2,$tmp3,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp3,$tmp2,$tmp\n\t"
            "movq       $tmp2,$src1\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp2,$tmp3,$tmp\n\t"
            "movq       $dst,$tmp2\t! min reduction8L" %}
  ins_encode %{
    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
    __ vblendvpd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, 1);
    __ vextracti128_high($tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, 1);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, 1);
    __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, 1);
    __ vblendvpd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 1);
    __ movdq($tmp2$$XMMRegister, $src1$$Register);
    __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, 1);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, 1);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Float Min Reduction
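// Float (and, further down, double) min must preserve IEEE semantics
// (-0.0 < +0.0, NaN propagation), so lanes are combined through
// vmin_max_macro / vmin_max_macro_evex rather than a bare vminps/vminpd.
// Each width has two shapes: a two-address form that accumulates into dst,
// and a form matched only when src1 is the +Infinity identity constant.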
instruct rvmin2F_reduction_reg_av(legRegF dst, legVecD src, legVecD tmp, legVecD dtmp,
                                  legVecD atmp, legVecD btmp, legVecX xmm_1) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
  format %{ "vpermilps    $xmm_1,$src,1\n\t"
            "vminps_macro $dtmp,$xmm_1,$src\t! minps\n\t"
            "vminps_macro $dst,$dtmp,$dst\t! minps" %}
  ins_encode %{
    int vector_len = 0;
    __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin2F_reduction_reg(legRegF dst, immF src1, legVecD src2, legVecD tmp,
                               legVecD atmp, legVecD btmp, legVecX xmm_1) %{
  predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
  format %{ "vpermilps    $xmm_1,$src2,1\n\t"
            "vminps_macro $dst,$xmm_1,$src2\t! minps" %}
  ins_encode %{
    int vector_len = 0;
    __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin4F_reduction_reg_av(legRegF dst, legVecX src, legVecX tmp, legVecX dtmp,
                                  legVecX atmp, legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
  format %{ "vpermilps    $xmm_1,$src,14\n\t"
            "vminps_macro $xmm_0,$xmm_1,$src\t! minps\n\t"
            "vpermilps    $xmm_1,$xmm_0,1\n\t"
            "vminps_macro $dtmp,$xmm_1,$xmm_0\t! minps\n\t"
            "vminps_macro $dst,$dtmp,$dst\t! minps" %}
  ins_encode %{
    int vector_len = 0;
    __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 14, vector_len);
    __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dtmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin4F_reduction_reg(legRegF dst, immF src1, legVecX src2, legVecX tmp, legVecX atmp,
                               legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
  predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
  format %{ "vpermilps    $xmm_1,$src2,14\n\t"
            "vminps_macro $xmm_0,$xmm_1,$src2\t! minps\n\t"
            "vpermilps    $xmm_1,$xmm_0,1\n\t"
            "vminps_macro $dst,$xmm_1,$xmm_0\t! minps" %}
  ins_encode %{
    int vector_len = 0;
    __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 14, vector_len);
    __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8F_reduction_reg_av(legRegF dst, legVecY src, legVecY tmp, legVecY dtmp, legVecY atmp,
                                  legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
            "vminps_macro $ymm_0,$ymm_1,$src\t! minps\n\t"
            "vpermilps    $ymm_1,$ymm_0,14\n\t"
            "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
            "vpermilps    $ymm_1,$ymm_0,1\n\t"
            "vminps_macro $dtmp,$ymm_1,$ymm_0\t! minps\n\t"
            "vminps_macro $dst,$dtmp,$dst\t! minps" %}
  ins_encode %{
    int vector_len = 1;
    __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
    __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8F_reduction_reg(legRegF dst, immF src1, legVecY src2, legVecY tmp,
                               legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
  predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
            "vminps_macro $ymm_0,$ymm_1,$src2\t! minps\n\t"
            "vpermilps    $ymm_1,$ymm_0,14\n\t"
            "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
            "vpermilps    $ymm_1,$ymm_0,1\n\t"
            "vminps_macro $dst,$ymm_1,$ymm_0\t! minps" %}
  ins_encode %{
    int vector_len = 1;
    __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
    __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

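// 512-bit float reduction: split off the two 256-bit halves with
// vextractf64x4, then combine lanes through the EVEX macro, which uses an
// opmask register (k1 here) for its NaN/-0.0 aware blending.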
instruct rvmin16F_reduction_reg_av(regF dst, vecZ src, vecZ tmp, vecZ dtmp,
                                   vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{
       "vextractf64x4 $ymm_0,$src,0\n\t"
       "vextractf64x4 $ymm_1,$src,1\n\t"
       "vminps_macro  $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermpd       $ymm_1,$ymm_0,78\n\t"
       "vminps_macro  $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermilps     $ymm_1,$ymm_0,14\n\t"
       "vminps_macro  $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermilps     $ymm_1,$ymm_0,1\n\t"
       "vminps_macro  $dtmp,$ymm_1,$ymm_0\t! minps\n\t"
       "vminps_macro  $dst,$dtmp,$dst\t! minps" %}
  ins_encode %{
    int vector_len = 1;
    KRegister ktmp = k1;
    __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
    __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin16F_reduction_reg(regF dst, immF src1, vecZ src2, vecZ tmp,
                                vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
  predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{
       "vextractf64x4 $ymm_0,$src2,0\n\t"
       "vextractf64x4 $ymm_1,$src2,1\n\t"
       "vminps_macro  $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermpd       $ymm_1,$ymm_0,78\n\t"
       "vminps_macro  $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermilps     $ymm_1,$ymm_0,14\n\t"
       "vminps_macro  $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermilps     $ymm_1,$ymm_0,1\n\t"
       "vminps_macro  $dst,$ymm_1,$ymm_0\t! minps" %}
  ins_encode %{
    int vector_len = 1;
    KRegister ktmp = k1;
    __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
    __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
    __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

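// Double min reductions follow the float pattern, with vpermilpd/vpermpd
// lane swaps and the float/double flag passed to the min/max macro set to
// false.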
instruct rvmin2D_reduction_reg_av(legRegD dst, legVecX src, legVecX tmp, legVecX dtmp,
                                  legVecX atmp, legVecX btmp, legVecX xmm_1) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
  format %{ "vpermilpd    $xmm_1,$src,1\n\t"
            "vminpd_macro $dtmp,$xmm_1,$src\t! minpd\n\t"
            "vminpd_macro $dst,$dtmp,$dst\t! minpd" %}
  ins_encode %{
    int vector_len = 0;
    __ vpermilpd($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin2D_reduction_reg(legRegD dst, immD src1, legVecX src2, legVecX tmp,
                               legVecX atmp, legVecX btmp, legVecX xmm_1) %{
  predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
  format %{ "vpermilpd    $xmm_1,$src2,1\n\t"
            "vminpd_macro $dst,$xmm_1,$src2\t! minpd" %}
  ins_encode %{
    int vector_len = 0;
    __ vpermilpd($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin4D_reduction_reg_av(legRegD dst, legVecY src, legVecY tmp, legVecY dtmp,
                                  legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
            "vminpd_macro $ymm_0,$ymm_1,$src\t! minpd\n\t"
            "vpermilpd    $ymm_1,$ymm_0,1\n\t"
            "vminpd_macro $dtmp,$ymm_1,$ymm_0\t! minpd\n\t"
            "vminpd_macro $dst,$dtmp,$dst\t! minpd" %}
  ins_encode %{
    int vector_len = 1;
    __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin4D_reduction_reg(legRegD dst, immD src1, legVecY src2, legVecY tmp,
                               legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
  predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
            "vminpd_macro $ymm_0,$ymm_1,$src2\t! minpd\n\t"
            "vpermilpd    $ymm_1,$ymm_0,1\n\t"
            "vminpd_macro $dst,$ymm_1,$ymm_0\t! minpd" %}
  ins_encode %{
    int vector_len = 1;
    __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
                      $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8D_reduction_reg_av(regD dst, vecZ src, vecZ tmp, vecZ dtmp, vecZ atmp,
                                  vecZ btmp, vecY ymm_0, vecY ymm_1) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (MinReductionV dst src));
  effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{
       "vextractf64x4 $ymm_0,$src,0\n\t"
       "vextractf64x4 $ymm_1,$src,1\n\t"
       "vminpd_macro  $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
       "vpermpd       $ymm_1,$ymm_0,14\n\t"
       "vminpd_macro  $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
       "vpermilpd     $ymm_1,$ymm_0,1\n\t"
       "vminpd_macro  $dtmp,$ymm_1,$ymm_0\t! minpd\n\t"
       "vminpd_macro  $dst,$dtmp,$dst\t! minpd" %}
  ins_encode %{
    int vector_len = 1;
    KRegister ktmp = k1;
    __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
    __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmin8D_reduction_reg(regD dst, immD src1, vecZ src2, vecZ tmp,
                               vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
  predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeD::POS_INF &&
            n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
  match(Set dst (MinReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
  format %{
       "vextractf64x4 $ymm_0,$src2,0\n\t"
       "vextractf64x4 $ymm_1,$src2,1\n\t"
       "vminpd_macro  $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
       "vpermpd       $ymm_1,$ymm_0,14\n\t"
       "vminpd_macro  $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
       "vpermilpd     $ymm_1,$ymm_0,1\n\t"
       "vminpd_macro  $dst,$ymm_1,$ymm_0\t! minpd" %}
  ins_encode %{
    int vector_len = 1;
    KRegister ktmp = k1;
    __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
    __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
    __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
    __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
                    $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// ------- Max Reduction ------------
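// These mirror the min reductions above: the packed combine switches to
// pmaxs*/vpmaxs*, and the scalar tail selects with the 'greater' condition.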

instruct rsmax8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pmaxsb  $tmp,$src2\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovg  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction8B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pmaxsb($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($tmp3$$Register, $dst$$Register);
    __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsmax16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp4,$src2,0xE\n\t"
            "pmaxsb  $tmp4,$src2\n\t"
            "pshufd  $tmp,$tmp4,0x1\n\t"
            "pmaxsb  $tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovg  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction16B" %}
  ins_encode %{
    __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pmaxsb($tmp4$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister, 0x1);
    __ pmaxsb($tmp$$XMMRegister, $tmp4$$XMMRegister);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($tmp3$$Register, $dst$$Register);
    __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmax16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd   $tmp4,$src2,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$src2\n\t"
            "pshufd   $tmp,$tmp4,0x1\n\t"
            "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovg  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction16B" %}
  ins_encode %{
    int vector_len = 0;
    __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister, 0x1);
    __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, vector_len);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($tmp3$$Register, $dst$$Register);
    __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmax32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpmaxsb  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp4,$tmp,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$tmp\n\t"
            "pshufd   $tmp,$tmp4,0x1\n\t"
            "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovg  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction32B" %}
  ins_encode %{
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
    __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
    __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister, 0x1);
    __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($tmp3$$Register, $dst$$Register);
    __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmax64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "vextracti64x4_high  $tmp4,$src2\n\t"
            "vpmaxsb  $tmp4,$tmp4,$src2\n\t"
            "vextracti128_high  $tmp,$tmp4\n\t"
            "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
            "pshufd   $tmp4,$tmp,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$tmp\n\t"
            "pshufd   $tmp,$tmp4,0x1\n\t"
            "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovg  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction64B" %}
  ins_encode %{
    __ vextracti64x4_high($tmp4$$XMMRegister, $src2$$XMMRegister);
    __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp4$$XMMRegister);
    __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
    __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
    __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister, 0x1);
    __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ movsbl($tmp2$$Register, $tmp2$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ movsbl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($tmp3$$Register, $dst$$Register);
    __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

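// Short max reductions use the same lane-halving scheme as the short min
// reductions above, with pmaxsw/vpmaxsw as the packed combine.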
instruct rsmax4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pmaxsw  $tmp,$src2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "movswl  $tmp2,$tmp2\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction4S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pmaxsw($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movswl($tmp2$$Register, $tmp2$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movswl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmax4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (MaxReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd   $tmp,$src2,0x1\n\t"
            "vpmaxsw  $tmp,$tmp,$src2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "movswl  $tmp2,$tmp2\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovg  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovg  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction4S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ movswl($tmp2$$Register, $tmp2$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ movswl($tmp3$$Register, $tmp3$$Register);
    __ cmpl($tmp2$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
    __ cmpl($src1$$Register, $tmp3$$Register);
    __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
    __ movl($dst$$Register, $tmp3$$Register);
  %}
  ins_pipe( pipe_slow );
%}

9601 instruct rsmax8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9602   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9603   match(Set dst (MaxReductionV src1 src2));
9604   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9605   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
9606             "pmaxsw  $tmp2,$src2\n\t"
9607             "pshufd  $tmp,$tmp2,0x1\n\t"
9608             "pmaxsw  $tmp,$tmp2\n\t"
9609             "pextrw  $tmp2,$tmp\n\t"
9610             "movswl  $tmp2,$tmp2\n\t"
9611             "pextrw  $tmp3,$tmp, 0x0\n\t"
9612             "movswl  $tmp3,$tmp3\n\t"
9613             "cmpl    $tmp2,$tmp3\n\t"
9614             "cmovl  $tmp3,$tmp2\n\t"
9615             "cmpl  $src1,$tmp3\n\t"
9616             "cmovl  $tmp3,$src1\n\t"
9617             "movl  $dst,$tmp3\t! max reduction8S" %}
9618   ins_encode %{
9619     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister,0xE);
9620     __ pmaxsw($tmp2$$XMMRegister, $src2$$XMMRegister);
9621     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9622     __ pmaxsw($tmp$$XMMRegister, $tmp2$$XMMRegister);
9623     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9624     __ movswl($tmp4$$Register, $tmp4$$Register);
9625     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9626     __ movswl($tmp3$$Register, $tmp3$$Register);
9627     __ cmpl($tmp4$$Register, $tmp3$$Register);
9628     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9629     __ cmpl($src1$$Register, $tmp3$$Register);
9630     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9631     __ movl($dst$$Register, $tmp3$$Register);
9632   %}
9633   ins_pipe( pipe_slow );
9634 %}
9635 
9636 instruct rvmax8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9637   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9638   match(Set dst (MaxReductionV src1 src2));
9639   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9640   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9641             "vpmaxsw  $tmp,$tmp,$src2\n\t"
9642             "pshufd   $tmp2,$tmp,0x1\n\t"
9643             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9644             "movzwl   $dst,$src1\n\t"
9645             "pextrw   $tmp3,$tmp, 0x0\n\t"
9646             "vpmaxsw  $dst,$dst,$tmp3\n\t"
9647             "pextrw   $tmp3,$tmp, 0x1\n\t"
9648             "vpmaxsw  $dst,$dst,$tmp3\n\t"
9649             "movswl   $dst,$dst\t! max reduction8S" %}
9650   ins_encode %{
9651     int vector_len = 0;
9652     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9653     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9654     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9655     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9656     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9657     __ movswl($tmp4$$Register, $tmp4$$Register);
9658     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9659     __ movswl($tmp3$$Register, $tmp3$$Register);
9660     __ cmpl($tmp4$$Register, $tmp3$$Register);
9661     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9662     __ cmpl($src1$$Register, $tmp3$$Register);
9663     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9664     __ movl($dst$$Register, $tmp3$$Register);
9665   %}
9666   ins_pipe( pipe_slow );
9667 %}
9668 
9669 instruct rvmax16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9670   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9671   match(Set dst (MaxReductionV src1 src2));
9672   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9673   format %{ "vextracti128_high  $tmp,$src2\n\t"
9674             "vpmaxsw  $tmp,$tmp,$src2\n\t"
9675             "pshufd  $tmp2,$tmp,0xE\n\t"
9676             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9677             "pshufd  $tmp2,$tmp,0x1\n\t"
9678             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9679             "pextrw  $tmp2,$tmp, 0x1\n\t"
9680             "movswl  $tmp2,$tmp2\n\t"
9681             "pextrw  $tmp3,$tmp, 0x0\n\t"
9682             "movswl  $tmp3,$tmp3\n\t"
9683             "cmpl  $tmp2$tmp3\n\t"
9684             "cmovl  $tmp3,$tmp2\n\t"
9685             "cmpl  $src1,$tmp3\n\t"
9686             "cmovl  $tmp3,$src1\n\t"
9687             "movl  $dst,$tmp3\t! max reduction16S" %}
9688   ins_encode %{
9689     int vector_len = 1;
9690     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9691     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9692     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9693     __ vpmaxsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9694     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9695     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9696     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9697     __ movswl($tmp4$$Register, $tmp4$$Register);
9698     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9699     __ movswl($tmp3$$Register, $tmp3$$Register);
9700     __ cmpl($tmp4$$Register, $tmp3$$Register);
9701     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9702     __ cmpl($src1$$Register, $tmp3$$Register);
9703     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9704     __ movl($dst$$Register, $tmp3$$Register);
9705   %}
9706   ins_pipe( pipe_slow );
9707 %}
9708 
9709 instruct rvmax32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9710   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9711   match(Set dst (MaxReductionV src1 src2));
9712   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9713   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
9714             "vpmaxsw  $tmp2,$tmp2,$src2\n\t"
9715             "vextracti128_high  $tmp,$tmp2\n\t"
9716             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9717             "pshufd  $tmp2,$tmp,0xE\n\t"
9718             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9719             "pshufd  $tmp2,$tmp,0x1\n\t"
9720             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9721             "pextrw  $tmp3,$tmp, 0x0\n\t"
9722             "movswl  $dst,$src1\n\t"
9723             "pextrw  $tmp3,$tmp, 0x0\n\t"
9724             "movswl  $dst,$src1\n\t"
9725             "cmpl  $tmp2$tmp3\n\t"
9726             "cmovl  $tmp3,$tmp2\n\t"
9727             "cmpl  $src1,$tmp3\n\t"
9728             "cmovl  $tmp3,$src1\n\t"
9729             "movl  $dst,$dst\t! max reduction32S" %}
9730   ins_encode %{
9731     int vector_len = 2;
9732     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
9733     __ vpmaxsw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
9734     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
9735     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9736     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9737     __ vpmaxsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9738     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9739     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9740     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9741     __ movswl($tmp4$$Register, $tmp4$$Register);
9742     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9743     __ movswl($tmp3$$Register, $tmp3$$Register);
9744     __ cmpl($tmp4$$Register, $tmp3$$Register);
9745     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9746     __ cmpl($src1$$Register, $tmp3$$Register);
9747     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9748     __ movl($dst$$Register, $tmp3$$Register);
9749   %}
9750   ins_pipe( pipe_slow );
9751 %}
9752 
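// The int max reductions below use the standard log2 fold: pshufd 0xE brings
// the upper 64 bits down, pshufd 0x1 brings the odd lane down, and pmaxsd
// combines the halves; wider forms first narrow with vextracti128/64x4_high.
// Scalar equivalent of the 4-lane case (illustrative only, not emitted code):
//
//   int reduce_max4(int src1, const int v[4]) {
//     int a = (v[0] > v[2]) ? v[0] : v[2];  // pshufd 0xE + pmaxsd
//     int b = (v[1] > v[3]) ? v[1] : v[3];
//     int m = (a > b) ? a : b;              // pshufd 0x1 + pmaxsd
//     return (src1 > m) ? src1 : m;         // movd + pmaxsd
//   }
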
9753 instruct rsmax2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
9754   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9755   match(Set dst (MaxReductionV src1 src2));
9756   effect(TEMP tmp, TEMP tmp2);
9757   format %{ "pshufd  $tmp,$src2,0x1\n\t"
9758             "pmaxsd  $tmp,$src2\n\t"
9759             "movd    $tmp2,$src1\n\t"
9760             "pmaxsd  $tmp2,$tmp\n\t"
9761             "movd    $dst,$tmp2\t! max reduction2I" %}
9762   ins_encode %{
9763     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9764     __ pmaxsd($tmp$$XMMRegister, $src2$$XMMRegister);
9765     __ movdl($tmp2$$XMMRegister, $src1$$Register);
9766     __ pmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
9767     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9768   %}
9769   ins_pipe( pipe_slow );
9770 %}
9771 
9772 instruct rvmax2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
9773   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9774   match(Set dst (MaxReductionV src1 src2));
9775   effect(TEMP tmp, TEMP tmp2);
9776   format %{ "pshufd   $tmp,$src2,0x1\n\t"
9777             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9778             "movd     $tmp,$src1\n\t"
9779             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9780             "movd     $dst,$tmp2\t! max reduction2I" %}
9781   ins_encode %{
9782     int vector_len = 0;
9783     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9784     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9785     __ movdl($tmp$$XMMRegister, $src1$$Register);
9786     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9787     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9788   %}
9789   ins_pipe( pipe_slow );
9790 %}
9791 
9792 instruct rsmax4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9793   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9794   match(Set dst (MaxReductionV src1 src2));
9795   effect(TEMP tmp, TEMP tmp2);
9796   format %{ "pshufd  $tmp,$src2,0xE\n\t"
9797             "pmaxsd  $tmp,$src2\n\t"
9798             "pshufd  $tmp2,$tmp,0x1\n\t"
9799             "pmaxsd  $tmp2,$tmp\n\t"
9800             "movd    $tmp,$src1\n\t"
9801             "pmaxsd  $tmp2,$tmp\n\t"
9802             "movd    $dst,$tmp2\t! max reduction4I" %}
9803   ins_encode %{
9804     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9805     __ pmaxsd($tmp$$XMMRegister, $src2$$XMMRegister);
9806     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
9807     __ pmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
9808     __ movdl($tmp$$XMMRegister, $src1$$Register);
9809     __ pmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
9810     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9811   %}
9812   ins_pipe( pipe_slow );
9813 %}
9814 
9815 instruct rvmax4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9816   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9817   match(Set dst (MaxReductionV src1 src2));
9818   effect(TEMP tmp, TEMP tmp2);
9819   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9820             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9821             "pshufd   $tmp,$tmp2,0x1\n\t"
9822             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9823             "movd     $tmp,$src1\n\t"
9824             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9825             "movd     $dst,$tmp2\t! max reduction4I" %}
9826   ins_encode %{
9827     int vector_len = 0;
9828     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9829     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9830     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9831     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9832     __ movdl($tmp$$XMMRegister, $src1$$Register);
9833     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9834     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9835   %}
9836   ins_pipe( pipe_slow );
9837 %}
9838 
9839 instruct rvmax4I_reduction_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9840   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9841   match(Set dst (MaxReductionV src1 src2));
9842   effect(TEMP tmp, TEMP tmp2);
9843   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9844             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9845             "pshufd   $tmp,$tmp2,0x1\n\t"
9846             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9847             "movd     $tmp,$src1\n\t"
9848             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9849             "movd     $dst,$tmp2\t! max reduction4I" %}
9850   ins_encode %{
9851     int vector_len = 0;
9852     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9853     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9854     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9855     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9856     __ movdl($tmp$$XMMRegister, $src1$$Register);
9857     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9858     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9859   %}
9860   ins_pipe( pipe_slow );
9861 %}
9862 
9863 instruct rvmax8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
9864   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9865   match(Set dst (MaxReductionV src1 src2));
9866   effect(TEMP tmp, TEMP tmp2);
9867   format %{ "vextracti128_high   $tmp,$src2\n\t"
9868             "vpmaxsd  $tmp,$tmp,$src2\n\t"
9869             "pshufd   $tmp2,$tmp,0xE\n\t"
9870             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9871             "pshufd   $tmp,$tmp2,0x1\n\t"
9872             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9873             "movd     $tmp,$src1\n\t"
9874             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9875             "movd     $dst,$tmp2\t! max reduction8I" %}
9876   ins_encode %{
9877     int vector_len = 1;
9878     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9879     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9880     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9881     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9882     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9883     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9884     __ movdl($tmp$$XMMRegister, $src1$$Register);
9885     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9886     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9887   %}
9888   ins_pipe( pipe_slow );
9889 %}
9890 
9891 instruct rvmax8I_reduction_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
9892   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9893   match(Set dst (MaxReductionV src1 src2));
9894   effect(TEMP tmp, TEMP tmp2);
9895   format %{ "vextracti128_high   $tmp,$src2\n\t"
9896             "vpmaxsd  $tmp,$tmp,$src2\n\t"
9897             "pshufd   $tmp2,$tmp,0xE\n\t"
9898             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9899             "pshufd   $tmp,$tmp2,0x1\n\t"
9900             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9901             "movd     $tmp,$src1\n\t"
9902             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9903             "movd     $dst,$tmp2\t! max reduction8I" %}
9904   ins_encode %{
9905     int vector_len = 1;
9906     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9907     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9908     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9909     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9910     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9911     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9912     __ movdl($tmp$$XMMRegister, $src1$$Register);
9913     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9914     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9915   %}
9916   ins_pipe( pipe_slow );
9917 %}
9918 
9919 instruct rvmax16I_reduction_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
9920   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9921   match(Set dst (MaxReductionV src1 src2));
9922   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
9923   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
9924             "vpmaxsd  $tmp3,$tmp3,$src2\n\t"
9925             "vextracti128_high   $tmp,$tmp3\n\t"
9926             "vpmaxsd  $tmp,$tmp,$tmp3\n\t"
9927             "pshufd   $tmp2,$tmp,0xE\n\t"
9928             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9929             "pshufd   $tmp,$tmp2,0x1\n\t"
9930             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9931             "movd     $tmp,$src1\n\t"
9932             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9933             "movd     $dst,$tmp2\t! max reduction16I" %}
9934   ins_encode %{
9935     int vector_len = 2;
9936     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
9937     __ vpmaxsd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
9938     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
9939     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
9940     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9941     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9942     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9943     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9944     __ movdl($tmp$$XMMRegister, $src1$$Register);
9945     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9946     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9947   %}
9948   ins_pipe( pipe_slow );
9949 %}
9950 
9951 // Long Max Reduction
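// There is no packed signed 64-bit max below AVX-512, so these forms build
// one from pcmpgtq plus (v)blendvpd: the compare yields an all-ones mask in
// the greater lanes and the blend's sign-bit select keeps the larger value.
// Scalar sketch of one select step (illustrative only, not emitted code):
//
//   long max_step(long a, long b) {
//     long mask = (a > b) ? -1L : 0L;  // pcmpgtq
//     return mask ? a : b;             // blendvpd selects by mask sign bit
//   }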
9952 instruct rsmax1L_reduction_reg(rRegL dst, rRegL src1, legVecD src2, rxmm0 xmm_0, legVecD tmp2, legVecD tmp3) %{
9953   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9954   match(Set dst (MaxReductionV src1 src2));
9955   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
9956   format %{ "movdq      $xmm_0,$src1\n\t"
9957             "movdq      $tmp2,$src1\n\t"
9958             "pcmpgtq   $xmm_0,$src2\n\t"
9959             "blendvpd  $tmp2,$src2\n\t"
9960             "movdq      $dst,$tmp2\t! max reduction1L" %}
9961   ins_encode %{
9962     __ movdq($xmm_0$$XMMRegister,$src1$$Register);
9963     __ movdq($tmp2$$XMMRegister,$src1$$Register);
    __ movdqu($tmp3$$XMMRegister, $src2$$XMMRegister);
9965     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9966     __ blendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister);
9967     __ movdq($dst$$Register, $tmp3$$XMMRegister);
9968   %}
9969   ins_pipe( pipe_slow );
9970 %}
9971 
9972 instruct rsmax2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, rxmm0 xmm_0, vecX tmp2, vecX tmp3) %{
9973   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9974   match(Set dst (MaxReductionV src1 src2));
9975   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
9976   format %{ "pshufd   $tmp3,$src2,0xE\n\t"
9977             "movdqu  $xmm_0,$src2\n\t"
9978             "pcmpgtq  $xmm_0,$tmp3\n\t"
9979             "blendvpd  $tmp3,$src2\n\t"
9980             "movdqu  $xmm_0,$tmp3\n\t"
9981             "movdq  $tmp2,$src1\n\t"
9982             "pcmpgtq  $xmm_0,$tmp2\n\t"
9983             "blendvpd  $tmp2,$tmp3\n\t"
9984             "movq     $dst,$tmp2\t! max reduction2L" %}
9985   ins_encode %{
9986     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 0xE);
9987     __ movdqu($xmm_0$$XMMRegister, $src2$$XMMRegister);
9988     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9989     __ blendvpd($tmp3$$XMMRegister, $src2$$XMMRegister);
9990     __ movdqu($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9991     __ movdq($tmp2$$XMMRegister, $src1$$Register);
9992     __ pcmpgtq($xmm_0$$XMMRegister, $tmp2$$XMMRegister);
9993     __ blendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister);
9994     __ movdq($dst$$Register, $tmp2$$XMMRegister);
9995   %}
9996   ins_pipe( pipe_slow );
9997 %}
9998 
9999 instruct rvmax2L_reduction_reg(rRegL dst, rRegL src1, legVecX src2, legVecX tmp, legVecX tmp2, legVecX tmp3) %{
10000   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10001   match(Set dst (MaxReductionV src1 src2));
10002   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
10003   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
10004             "vpcmpgtq  $tmp,$tmp2,$src2\n\t"
10005             "vblendvpd   $tmp2,$tmp2,$src2,$tmp\n\t"
10006             "movq     $tmp,$src1\n\t"
10007             "vpcmpgtq  $tmp3,$tmp2,$tmp\n\t"
10008             "blendvpd   $tmp2,$tmp2,$src1,$tmp3\n\t"
10009             "movq     $dst,$tmp2\t! max reduction2L" %}
10010   ins_encode %{
10011     int vector_len = 0;
10012     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10013     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
10014     __ vblendvpd($tmp2$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10015     __ movdq($tmp$$XMMRegister,$src1$$Register);
10016     __ vpcmpgtq($tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10017     __ vblendvpd($tmp2$$XMMRegister, $tmp$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, vector_len);
10018     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10019   %}
10020   ins_pipe( pipe_slow );
10021 %}
10022 
10023 instruct rvmax4L_reduction_reg(rRegL dst, rRegL src1, legVecY src2, legVecY tmp, legVecY tmp2, legVecY tmp3) %{
10024   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10025   match(Set dst (MaxReductionV src1 src2));
10026   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
10027   format %{ "vextracti128_high   $tmp2,$src2\n\t"
10028             "vpcmpgtq  $tmp,$tmp2,$src2\n\t"
10029             "vblendvpd   $tmp2,$tmp2,$src2,$tmp\n\t"
10030             "vpshufd   $tmp3, $tmp2,0x1\n\t"
10031             "vpcmpgtq  $tmp, $tmp3,$tmp\n\t2"
10032             "vblendvpd $tmp3,$tmp3,$tmp2,$tmp\n\t"
10033             "movq     $tmp2,$src1\n\t"
10034             "vpcmpgtq  $tmp,$tmp2,$tmp3\n\t"
10035             "blendvpd   $tmp2,$tmp2,$tmp3,$tmp\n\t"
10036             "movq     $dst,$tmp2\t! max reduction2L" %}
10037   ins_encode %{
10038     int vector_len = 1;
10039     __ vextracti128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10040     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
10041     __ vblendvpd($tmp2$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10042     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
10043     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10044     __ vblendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10045     __ movdq($tmp$$XMMRegister,$src1$$Register);
10046     __ vpcmpgtq($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10047     __ vblendvpd($tmp2$$XMMRegister, $tmp$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, vector_len);
10048     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10049   %}
10050   ins_pipe( pipe_slow );
10051 %}
10052 
10053 instruct rvmax8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
10054   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10055   match(Set dst (MaxReductionV src1 src2));
10056   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
10057   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
10058             "vpcmpgtq  $tmp,$tmp3,$src2\n\t"
10059             "vblendvpd   $tmp3,$tmp3,$src2,$tmp\n\t"
10060             "vextracti128_high   $tmp2,$tmp3\n\t"
10061             "vpcmpgtq  $tmp,$tmp2,$tmp3\n\t"
10062             "vblendvpd   $tmp2,$tmp2,$tmp3,$tmp\n\t"
10063             "vpshufd  $tmp3,$tmp2,0x1\n\t"
10064             "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
10065             "vblendvpd  $tmp3,$tmp3,$tmp2,$tmp\n\t"
10066             "movq     $tmp2,$src1\n\t"
10067             "vpcmpgtq  $tmp,$tmp2,$tmp3\n\t"
10068             "vblendvpd  $tmp2,$tmp2,$tmp3,$tmp\n\t"
10069             "movq     $dst,$tmp2\t! max reduction4I" %}
10070   ins_encode %{
10071     int vector_len = 1;
10072     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
10073     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
10074     __ vblendvpd($tmp3$$XMMRegister, $src2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10075     __ vextracti128_high($tmp2$$XMMRegister, $tmp3$$XMMRegister);
10076     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
10077     __ vblendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10078     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
10079     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10080     __ vblendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10081     __ movdq($tmp2$$XMMRegister, $src1$$Register);
10082     __ vpcmpgtq($tmp$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
10083     __ vblendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10084     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10085   %}
10086   ins_pipe( pipe_slow );
10087 %}
10088 
10089 // Float max Reduction
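// Float max must match Java semantics: NaN in either input produces NaN and
// max(+0.0f, -0.0f) is +0.0f, so a bare (v)maxps (which favors the second
// operand on NaN and ignores zero signs) is insufficient by itself; the
// vmin_max_macro/vmin_max_macro_evex helpers are assumed to add the blends
// required for NaN and signed-zero handling. Scalar sketch of the required
// behavior (illustrative only, not emitted code):
//
//   #include <cmath>
//   float java_max(float a, float b) {
//     if (std::isnan(a) || std::isnan(b)) return NAN;  // NaN propagates
//     if (a == 0.0f && b == 0.0f)                      // +0.0 beats -0.0
//       return std::signbit(a) ? b : a;
//     return (a > b) ? a : b;
//   }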
10090 instruct rvmax2F_reduction_reg_av(legRegF dst, legVecD src, legVecD tmp,
10091                                   legVecD dtmp, legVecD atmp, legVecD btmp, legVecX xmm_1) %{
10092   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10093   match(Set dst (MaxReductionV dst src));
10094   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
10095   format %{ "vpermilps    $tmp,$src,1\n\t"
10096             "vminps_macro $dtmp,$tmp,$src\t! minps\n\t"
10097             "vminps_macro $dst,$dtmp,$dst\t! minps" %}
10098   ins_encode %{
10099     int vector_len = 0;
10100     __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
10101     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10102                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, false, vector_len);
10103     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10104                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, false, vector_len);
10105   %}
10106   ins_pipe( pipe_slow );
10107 %}
10108 
10109 instruct rvmax2F_reduction_reg(legRegF dst, immF src1, legVecD src2, legVecD tmp,
10110                                legVecD atmp, legVecD btmp, legVecX xmm_1) %{
10111   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF && 
10112             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10113   match(Set dst (MaxReductionV src1 src2));
10114   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
10115   format %{ "vpermilps    $tmp,$src2,1\n\t"
10116             "vminps_macro $dst,$tmp,$src2\t! minps" %}
10117   ins_encode %{
10118     int vector_len = 0;
10119     __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
10120     __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10121                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, false, vector_len);
10122   %}
10123   ins_pipe( pipe_slow );
10124 %}
10125 
10126 instruct rvmax4F_reduction_reg_av(legRegF dst, legVecX src, legVecX tmp, legVecX dtmp,
10127                                   legVecX atmp, legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
10128   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10129   match(Set dst (MaxReductionV dst src));
10130   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
10131   format %{ "vpermilps    $xmm_1,$src,14\n\t"
10132             "vmaxps_macro $xmm_0,$xmm_1,$src\t! maxps\n\t"
10133             "vpermilps    $xmm_1,$xmm_0,1\n\t"
10134             "vmaxps_macro $dtmp,$xmm_1,$xmm_0\t! maxps\n\t"
10135             "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10136   ins_encode %{
10137     int vector_len = 0;
10138     __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 14, vector_len);
10139     __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10140                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10141     __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
10142     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
10143                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10144     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10145                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10146   %}
10147   ins_pipe( pipe_slow );
10148 %}
10149 
10150 instruct rvmax4F_reduction_reg(legRegF dst, immF src1, legVecX src2, legVecX tmp,
10151                                legVecX atmp, legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
10152   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF && 
10153             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10154   match(Set dst (MaxReductionV src1 src2));
10155   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
10156   format %{ "vpermilps    $xmm_1,$src2,14\n\t"
10157             "vmaxps_macro $xmm_0,$xmm_1,$src2\t! maxps\n\t"
10158             "vpermilps    $xmm_1,$xmm_0,1\n\t"
10159             "vmaxps_macro $xmm_0,$xmm_1,$xmm_0\t! maxps" %}
10160   ins_encode %{
10161     int vector_len = 0;
10162     __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 14, vector_len);
10163     __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10164                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10165     __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
10166     __ vmin_max_macro($dst$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
10167                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10168   %}
10169   ins_pipe( pipe_slow );
10170 %}
10171 
10172 instruct rvmax8F_reduction_reg_av(legRegF dst, legVecY src, legVecY tmp, legVecY dtmp,
10173                                   legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10174   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10175   match(Set dst (MaxReductionV dst src));
10176   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10177   format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
10178             "vmaxps_macro $ymm_0,$ymm_1,$src\t! maxps\n\t"
10179             "vpermilps    $ymm_1,$ymm_0,14\n\t"
10180             "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! mips\n\t"
10181             "vpermilps    $ymm_1,$ymm_0,1\n\t"
10182             "vmaxps_macro $dtmp,$ymm_1,$ymm_0\t! maxps\n\t" 
10183             "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10184   ins_encode %{
10185     int vector_len = 1;
10186     __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
10187     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10188                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10189     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10190     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
10191                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10192     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10193     __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10194                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10195     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10196                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10197   %}
10198   ins_pipe( pipe_slow );
10199 %}
10200 
10201 instruct rvmax8F_reduction_reg(legRegF dst, immF src1, legVecY src2, legVecY tmp, 
10202                                legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10203   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF &&
10204             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10205   match(Set dst (MaxReductionV src1 src2));
10206   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10207   format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
10208             "vmaxps_macro $ymm_0,$ymm_1,$src2\t! maxps\n\t"
10209             "vpermilps    $ymm_1,$ymm_0,14\n\t"
10210             "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! mips\n\t"
10211             "vpermilps    $ymm_1,$ymm_0,1\n\t"
10212             "vmaxps_macro $dst,$ymm_1,$ymm_0\t! maxps" %}
10213   ins_encode %{
10214     int vector_len = 1;
10215     __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
10216     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10217                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10218     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10219     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
10220                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10221     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10222     __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10223                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10224   %}
10225   ins_pipe( pipe_slow );
10226 %}
10227 
10228 instruct rvmax16F_reduction_reg_av(regF dst, vecZ src, vecZ dtmp, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10229   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10230   match(Set dst (MaxReductionV dst src));
10231   effect(TEMP dst, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10232   format %{
10233        "vextractf64x4 $ymm_0, $src, 0\n\t"
10234        "vextractf64x4 $ymm_1, $src, 1\n\t"
10235        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10236        "vpermpd      $ymm_1,$ymm_0, 78\n\t"
10237        "vmaxps_macro $ymm_0,$ymm_1,$src\t! maxps\n\t"
10238        "vpermilps    $ymm_1,$ymm_0,14\n\t"
10239        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! mips\n\t"
10240        "vpermilps    $ymm_1,$ymm_0,1\n\t"
10241        "vmaxps_macro $dtmp,$ymm_1,$ymm_0\t! maxps\n\t" 
10242        "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10243   ins_encode %{
10244     int vector_len = 1;
10245     KRegister  ktmp = k1;
10246     __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
10247     __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
10248     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10249                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10250     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
10251     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10252                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10253     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10254     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10255                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10256     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10257     __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
10258                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10259     __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
10260                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10261   %}
10262   ins_pipe( pipe_slow );
10263 %}
10264 
10265 instruct rvmax16F_reduction_reg(regF dst, immF src1, vecZ src2, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10266   predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF &&
10267             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10268   match(Set dst (MaxReductionV src1 src2));
10269   effect(TEMP dst, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10270   format %{
10271        "vextractf64x4 $ymm_0, $src2, 0\n\t"
10272        "vextractf64x4 $ymm_1, $src2, 1\n\t"
10273        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10274        "vpermpd      $ymm_1,$ymm_0, 78\n\t"
10275        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10276        "vpermilps    $ymm_1,$ymm_0,14\n\t"
10277        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! mips\n\t"
10278        "vpermilps    $ymm_1,$ymm_0,1\n\t"
10279        "vmaxps_macro $dst,$ymm_1,$ymm_0\t! maxps" %}
10280   ins_encode %{
10281     int vector_len = 1;
10282     KRegister  ktmp = k1;
10283     __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
10284     __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
10285     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10286                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10287     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
10288     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10289                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10290     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10291     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10292                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10293     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10294     __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
10295                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10296   %}
10297   ins_pipe( pipe_slow );
10298 %}
10299 
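// The double reductions mirror the float ones with pd shuffles: vperm2f128
// swaps the 128-bit halves and vpermilpd with immediate 1 swaps the two
// lanes within a half. Scalar sketch of the 2-lane fold plus accumulator,
// reusing java_max from the float note above applied to double (illustrative
// only, not emitted code):
//
//   double reduce_max2(double dst, const double v[2]) {
//     double m = java_max(v[0], v[1]);  // vpermilpd 1 + vmaxpd_macro
//     return java_max(m, dst);          // fold in the accumulator
//   }
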
10300 instruct rvmax2D_reduction_reg_av(legRegD dst, legVecX src, legVecX tmp, legVecX dtmp,
10301                                   legVecX atmp, legVecX btmp, legVecX xmm_1) %{
10302   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10303   match(Set dst (MaxReductionV dst src));
10304   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
10305   format %{ "vpermilpd    $xmm_1,$src,1\n\t"
10306             "vmaxpd_macro $dtmp,$xmm_1,$src\t! maxps\n\t" 
10307             "vmaxpd_macro $dst,$dtmp,$dst\t! maxps" %}
10308   ins_encode %{
10309     int vector_len = 0;
10310     __ vpermilpd($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
10311     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10312                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, false, vector_len);
10313     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10314                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, false, vector_len);
10315   %}
10316   ins_pipe( pipe_slow );
10317 %}
10318 
10319 instruct rvmax2D_reduction_reg(legRegD dst, immD src1 , legVecX src2, legVecX tmp,
10320                                legVecX atmp, legVecX btmp, legVecX xmm_1) %{
10321   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::NEG_INF &&
10322             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10323   match(Set dst (MaxReductionV src1 src2));
10324   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
10325   format %{ "vpermilpd    $xmm_1,$src2,1\n\t"
10326             "vmaxpd_macro $dst,$xmm_1,$src2\t! maxps" %}
10327   ins_encode %{
10328     int vector_len = 0;
10329     __ vpermilpd($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
10330     __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10331                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, false, vector_len);
10332   %}
10333   ins_pipe( pipe_slow );
10334 %}

instruct rvmax4D_reduction_reg_av(legRegD dst, legVecY src, legVecY tmp, legVecY dtmp,
10336                                   legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10337   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10338   match(Set dst (MaxReductionV dst src));
10339   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10340   format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
10341             "vmaxpd_macro $ymm_0,$ymm_1,$src\t! maxps\n\t"
10342             "vpermilpd    $ymm_1,$ymm_0,1\n\t"
10343             "vmaxpd_macro $dtmp,$ymm_1,$ymm_0\t! maxps\n\t"
10344             "vmaxpd_macro $dst,$dtmp,$dst\t! maxps" %}
10345   ins_encode %{
10346     int vector_len = 1;
10347     __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
10348     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10349                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10350     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10351     __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10352                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10353     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10354                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10355   %}
10356   ins_pipe( pipe_slow );
10357 %}
10358 
10359 instruct rvmax4D_reduction_reg(legRegD dst, immD src1, legVecY src2, legVecY tmp, 
10360                                legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10361   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::NEG_INF &&
10362             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10363   match(Set dst (MaxReductionV src1 src2));
10364   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10365   format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
10366             "vmaxpd_macro $ymm_0,$ymm_1,$src2\t! maxps\n\t"
10367             "vpermilpd    $ymm_1,$ymm_0,1\n\t"
10368             "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxps" %}
10369   ins_encode %{
10370     int vector_len = 1;
10371     __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
10372     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10373                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10374     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10375     __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10376                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10377   %}
10378   ins_pipe( pipe_slow );
10379 %}
10380 
10381 instruct rvmax8D_reduction_reg_av(regD dst, vecZ src, vecZ dtmp, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10382   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10383   match(Set dst (MaxReductionV dst src));
10384   effect(TEMP dst, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10385   format %{
10386        "vextractf64x4 $ymm_0, $src, 0\n\t"
10387        "vextractf64x4 $ymm_1, $src, 1\n\t"
10388        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10389        "vpermpd      $ymm_1,$ymm_0, 14\n\t"
10390        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10391        "vpermilpd    $ymm_1,$ymm_0,1\n\t"
10392        "vmaxpd_macro $dtmp,$ymm_1,$ymm_0\t! maxpd\n\t" 
10393        "vmaxpd_macro $dst,$dtmp,$dst\t! maxpd\n\t" %} 
10394   ins_encode %{
10395     int vector_len = 1;
10396     KRegister ktmp = k1;
10397     __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
10398     __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
10399     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10400                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10401     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10402     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10403                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10404     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10405     __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10406                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10407     __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
10408                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10409   %}
10410   ins_pipe( pipe_slow );
10411 %}
10412 
10414 instruct rvmax8D_reduction_reg(regD dst, immD src1, vecZ src2, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10415   predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeD::NEG_INF &&
10416             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10417   match(Set dst (MaxReductionV src1 src2));
10418   effect(TEMP dst, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10419   format %{
10420        "vextractf64x4 $ymm_0, $src2, 0\n\t"
10421        "vextractf64x4 $ymm_1, $src2, 1\n\t"
10422        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10423        "vpermpd      $ymm_1,$ymm_0, 14\n\t"
10424        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10425        "vpermilpd    $ymm_1,$ymm_0,1\n\t"
10426        "vmaxpd_macro $dst,$ymm_1,$ymm_0\t! maxpd\n\t" %} 
10427   ins_encode %{
10428     int vector_len = 1;
10429     KRegister ktmp = k1;
10430     __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
10431     __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
10432     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10433                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10434     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10435     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10436                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10437     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10438     __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10439                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10440   %}
10441   ins_pipe( pipe_slow );
10442 %}
10444 
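// AND is associative and commutative with no special-value concerns, so the
// reductions below simply fold vector halves with pand/vpand and then AND
// the surviving lanes into a GP register (pextrb/pextrw per lane, or one
// movdl plus shrl peeling in the widest form). Scalar equivalent of the 8B
// case (illustrative only, not emitted code):
//
//   int reduce_and8b(int src1, const signed char v[8]) {
//     int r = src1 & 0xFF;                  // movzbl
//     for (int i = 0; i < 8; i++)
//       r &= v[i];                          // pand fold + pextrb/andl tail
//     return (signed char)r;                // movsbl sign-extends the byte
//   }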
10445 instruct rsand8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10446   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10447   match(Set dst (AndReductionV src1 src2));
10448   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10449   format %{
10450             "pshufd  $tmp,$src2,0x1\n\t"
10451             "pand    $tmp,$src2\n\t"
10452             "movzbl  $dst,$src1\n\t"
10453             "pextrb  $tmp2,$tmp, 0x0\n\t"
10454             "andl    $dst,$tmp2\n\t"
10455             "pextrb  $tmp2,$tmp, 0x1\n\t"
10456             "andl    $dst,$tmp2\n\t"
10457             "pextrb  $tmp2,$tmp, 0x2\n\t"
10458             "andl    $dst,$tmp2\n\t"
10459             "pextrb  $tmp2,$tmp, 0x3\n\t"
10460             "andl    $dst,$tmp2\n\t"
10461             "movsbl  $dst,$dst\t! and reduction8B" %}
10462   ins_encode %{
10463     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10464     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10465     __ movzbl($dst$$Register, $src1$$Register);
10466     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10467     __ andl($dst$$Register, $tmp2$$Register);
10468     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10469     __ andl($dst$$Register, $tmp2$$Register);
10470     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
10471     __ andl($dst$$Register, $tmp2$$Register);
10472     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
10473     __ andl($dst$$Register, $tmp2$$Register);
10474     __ movsbl($dst$$Register, $dst$$Register);
10475   %}
10476   ins_pipe( pipe_slow );
10477 %}
10478 
10479 instruct rsand16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10480   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10481   match(Set dst (AndReductionV src1 src2));
10482   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10483   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10484             "pand    $tmp,$src2\n\t"
10485             "pshufd  $tmp2,$tmp,0x1\n\t"
10486             "pand    $tmp,$tmp,$tmp2\n\t"
10487             "movzbl  $dst,$src1\n\t"
10488             "pextrb  $tmp3,$tmp, 0x0\n\t"
10489             "andl    $dst,$tmp3\n\t"
10490             "pextrb  $tmp3,$tmp, 0x1\n\t"
10491             "andl    $dst,$tmp3\n\t"
10492             "pextrb  $tmp3,$tmp, 0x2\n\t"
10493             "andl    $dst,$tmp3\n\t"
10494             "pextrb  $tmp3,$tmp, 0x3\n\t"
10495             "andl    $dst,$tmp3\n\t"
10496             "movsbl  $dst,$dst\t! and reduction16B" %}
10497   ins_encode %{
10498     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10499     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10500     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10501     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
10502     __ movzbl($dst$$Register, $src1$$Register);
10503     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10504     __ andl($dst$$Register, $tmp3$$Register);
10505     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10506     __ andl($dst$$Register, $tmp3$$Register);
10507     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10508     __ andl($dst$$Register, $tmp3$$Register);
10509     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10510     __ andl($dst$$Register, $tmp3$$Register);
10511     __ movsbl($dst$$Register, $dst$$Register);
10512   %}
10513   ins_pipe( pipe_slow );
10514 %}
10515 
10516 instruct rvand32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10517   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10518   match(Set dst (AndReductionV src1 src2));
10519   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10520    format %{ "vextracti128_high  $tmp,$src2\n\t"
10521             "vpand   $tmp,$tmp,$src2\n\t"
10522             "pshufd  $tmp2,$tmp,0xE\n\t"
10523             "vpand   $tmp,$tmp,$tmp2\n\t"
10524             "pshufd  $tmp2,$tmp,0x1\n\t"
10525             "vpand   $tmp,$tmp,$tmp2\n\t"
10526             "movzbl  $dst,$src1\n\t"
10527             "pextrb  $tmp3,$tmp, 0x0\n\t"
10528             "andl    $dst,$tmp3\n\t"
10529             "pextrb  $tmp3,$tmp, 0x1\n\t"
10530             "andl    $dst,$tmp3\n\t"
10531             "pextrb  $tmp3,$tmp, 0x2\n\t"
10532             "andl    $dst,$tmp3\n\t"
10533             "pextrb  $tmp3,$tmp, 0x3\n\t"
10534             "andl    $dst,$tmp3\n\t"
10535             "movsbl  $dst,$dst\t! and reduction32B" %}
10536   ins_encode %{
10537     int vector_len = 0;
10538     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10539     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10540     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10541     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10542     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10543     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10544     __ movzbl($dst$$Register, $src1$$Register);
10545     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10546     __ andl($dst$$Register, $tmp3$$Register);
10547     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10548     __ andl($dst$$Register, $tmp3$$Register);
10549     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10550     __ andl($dst$$Register, $tmp3$$Register);
10551     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10552     __ andl($dst$$Register, $tmp3$$Register);
10553     __ movsbl($dst$$Register, $dst$$Register);
10554   %}
10555   ins_pipe( pipe_slow );
10556 %}
10557 
10558 instruct rvand64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10559   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10560   match(Set dst (AndReductionV src1 src2));
10561   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10562   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10563             "vpand   $tmp2,$tmp2,$src2\n\t"
10564             "vextracti128_high  $tmp,$tmp2\n\t"
10565             "vpand   $tmp,$tmp,$tmp2\n\t"
10566             "pshufd  $tmp2,$tmp,0xE\n\t"
10567             "vpand   $tmp,$tmp,$tmp2\n\t"
10568             "pshufd  $tmp2,$tmp,0x1\n\t"
10569             "vpand   $tmp,$tmp,$tmp2\n\t"
10570             "movzbl  $dst,$src1\n\t"
10571             "movdl   $tmp3,$tmp\n\t"
10572             "andl    $dst,$tmp3\n\t"
10573             "shrl    $tmp3,0x8\n\t"
10574             "andl    $dst,$tmp3\n\t"
10575             "shrl    $tmp3,0x8\n\t"
10576             "andl    $dst,$tmp3\n\t"
10577             "shrl    $tmp3,0x8\n\t"
10578             "andl    $dst,$tmp3\n\t"
10579             "movsbl  $dst,$dst\t! and reduction64B" %}
10580   ins_encode %{
10581     int vector_len = 0;
10582     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10583     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10584     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10585     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10586     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10587     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10588     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10589     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10590     __ movzbl($dst$$Register, $src1$$Register);
10591     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10592     __ andl($dst$$Register, $tmp3$$Register);
10593     __ shrl($tmp3$$Register, 8);
10594     __ andl($dst$$Register, $tmp3$$Register);
10595     __ shrl($tmp3$$Register, 8);
10596     __ andl($dst$$Register, $tmp3$$Register);
10597     __ shrl($tmp3$$Register, 8);
10598     __ andl($dst$$Register, $tmp3$$Register);
10599     __ movsbl($dst$$Register, $dst$$Register);
10600   %}
10601   ins_pipe( pipe_slow );
10602 %}
10603 
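// AND reductions over short vectors: the same folding pattern reduces the
// vector to one dword, whose two low words are ANDed into the zero-extended
// scalar before sign-extending the result.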
instruct rsand4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pand    $tmp,$src2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp2,$tmp, 0x0\n\t"
            "andw    $dst,$tmp2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "andw    $dst,$tmp2\n\t"
            "movswl  $dst,$dst\t! and reduction4S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ andw($dst$$Register, $tmp2$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ andw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsand8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "pand    $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "pand    $tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "andw    $dst,$tmp3\n\t"
            "pextrw  $tmp3,$tmp, 0x1\n\t"
            "andw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! and reduction8S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ andw($dst$$Register, $tmp3$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ andw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvand16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpand   $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpand   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpand   $tmp,$tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "andw    $dst,$tmp3\n\t"
            "pextrw  $tmp3,$tmp, 0x1\n\t"
            "andw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! and reduction16S" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ andw($dst$$Register, $tmp3$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ andw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvand32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpand   $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpand   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpand   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpand   $tmp,$tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "andw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
            "andw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! and reduction32S" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ andw($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 16);
    __ andw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

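// AND reductions over int vectors: successive shuffle-and-AND folds halve the
// vector until a single lane remains, which is combined with the scalar input
// through an XMM temporary.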
instruct rsand2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
            "pand    $tmp2,$src2\n\t"
            "movd    $tmp,$src1\n\t"
            "pand    $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! and reduction2I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsand4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "pand    $tmp2,$src2\n\t"
            "pshufd  $tmp,$tmp2,0x1\n\t"
            "pand    $tmp2,$tmp\n\t"
            "movd    $tmp,$src1\n\t"
            "pand    $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! and reduction4I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvand8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpand    $tmp,$tmp,$src2\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpand    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
            "vpand    $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpand    $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! and reduction8I" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvand16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpand  $tmp3,$tmp3,$src2\n\t"
            "vextracti128_high  $tmp,$tmp3\n\t"
            "vpand    $tmp,$tmp,$tmp3\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpand    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
            "vpand    $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpand    $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! and reduction16I" %}
  ins_encode %{
    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vpand($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, 0);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, 0);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

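// AND reductions over long vectors. These use 64-bit general registers
// (rRegL) and are therefore LP64-only.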
#ifdef _LP64
instruct rsand2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "pand    $tmp2,$src2\n\t"
            "movdq   $tmp,$src1\n\t"
            "pand    $tmp2,$tmp\n\t"
            "movq   $dst,$tmp2\t! and reduction2L" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvand4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpand  $tmp2,$tmp,$src2\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
            "vpand  $tmp2,$tmp2,$tmp\n\t"
            "movq   $tmp,$src1\n\t"
            "vpand  $tmp2,$tmp2,$tmp\n\t"
            "movq   $dst,$tmp2\t! and reduction4L" %}
  ins_encode %{
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
    __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movq($tmp$$XMMRegister, $src1$$Register);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvand8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (AndReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpandq  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpandq  $tmp2,$tmp2,$tmp\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
            "vpandq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $tmp,$src1\n\t"
            "vpandq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! and reduction8L" %}
  ins_encode %{
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
    __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif

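// OR reductions mirror the AND reductions above, with por/vpor/vporq as the
// combining operation.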
instruct rsor8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "por    $tmp,$src2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp2,$tmp, 0x0\n\t"
            "orl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "orl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x2\n\t"
            "orl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "orl    $dst,$tmp2\n\t"
            "movsbl  $dst,$dst\t! or reduction8B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ por($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ orl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ orl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
    __ orl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ orl($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsor16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "por    $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "por    $tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "orl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x1\n\t"
            "orl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "orl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x3\n\t"
            "orl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! or reduction16B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ por($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ por($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ orl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ orl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ orl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
    __ orl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpor   $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "orl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x1\n\t"
            "orl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "orl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x3\n\t"
            "orl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! or reduction32B" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ orl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ orl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ orl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
    __ orl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpor   $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "orl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "orl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "orl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "orl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! or reduction64B" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzbl($dst$$Register, $src1$$Register);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ orl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ orl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ orl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ orl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

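// OR reductions over short vectors.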
instruct rsor4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "por    $tmp,$src2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp2,$tmp, 0x0\n\t"
            "orw    $dst,$tmp2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "orw    $dst,$tmp2\n\t"
            "movswl  $dst,$dst\t! or reduction4S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ por($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ orw($dst$$Register, $tmp2$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ orw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsor8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "por    $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "por    $tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "orw    $dst,$tmp3\n\t"
            "pextrw  $tmp3,$tmp, 0x1\n\t"
            "orw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! or reduction8S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ por($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ por($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ orw($dst$$Register, $tmp3$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ orw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpor   $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "orw    $dst,$tmp3\n\t"
            "pextrw  $tmp3,$tmp, 0x1\n\t"
            "orw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! or reduction16S" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ orw($dst$$Register, $tmp3$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ orw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpor   $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpor   $tmp,$tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "orw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
            "orw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! or reduction32S" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ orw($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 16);
    __ orw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

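// OR reductions over int vectors.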
instruct rsor2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
            "por    $tmp2,$src2\n\t"
            "movd    $tmp,$src1\n\t"
            "por    $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! or reduction2I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsor4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "por    $tmp2,$src2\n\t"
            "pshufd  $tmp,$tmp2,0x1\n\t"
            "por    $tmp2,$tmp\n\t"
            "movd    $tmp,$src1\n\t"
            "por    $tmp2,$tmp\n\t"
            "movd    $dst,$tmp2\t! or reduction4I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpor    $tmp,$tmp,$src2\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpor    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
            "vpor    $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpor    $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! or reduction8I" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpor  $tmp3,$tmp3,$src2\n\t"
            "vextracti128_high  $tmp,$tmp3\n\t"
            "vpor    $tmp,$tmp,$tmp3\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpor    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
            "vpor    $tmp,$tmp,$tmp2\n\t"
            "movd     $tmp2,$src1\n\t"
            "vpor    $tmp2,$tmp,$tmp2\n\t"
            "movd     $dst,$tmp2\t! or reduction16I" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

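// OR reductions over long vectors (LP64-only, since they use rRegL).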
#ifdef _LP64
instruct rsor2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "por    $tmp2,$src2\n\t"
            "movdq   $tmp,$src1\n\t"
            "por    $tmp2,$tmp\n\t"
            "movq   $dst,$tmp2\t! or reduction2L" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpor  $tmp2,$tmp,$src2\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
            "vpor  $tmp2,$tmp2,$tmp\n\t"
            "movq   $tmp,$src1\n\t"
            "vpor  $tmp2,$tmp2,$tmp\n\t"
            "movq   $dst,$tmp2\t! or reduction4L" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
    __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movq($tmp$$XMMRegister, $src1$$Register);
    __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvor8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (OrReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vporq  $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vporq  $tmp2,$tmp2,$tmp\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
            "vporq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $tmp,$src1\n\t"
            "vporq  $tmp2,$tmp2,$tmp\n\t"
            "movdq   $dst,$tmp2\t! or reduction8L" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
    __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdq($tmp$$XMMRegister, $src1$$Register);
    __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ movdq($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif

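// XOR reductions follow the same folding structure, with pxor/vpxor as the
// combining operation.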
instruct rsxor8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pxor    $tmp,$src2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp2,$tmp, 0x0\n\t"
            "xorl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x1\n\t"
            "xorl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x2\n\t"
            "xorl    $dst,$tmp2\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "xorl    $dst,$tmp2\n\t"
            "movsbl  $dst,$dst\t! xor reduction8B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ xorl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ xorl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
    __ xorl($dst$$Register, $tmp2$$Register);
    __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
    __ xorl($dst$$Register, $tmp2$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsxor16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "pxor    $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "pxor    $tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "xorl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x1\n\t"
            "xorl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "xorl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x3\n\t"
            "xorl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! xor reduction16B" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ pxor($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvxor32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpxor   $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "pextrb  $tmp3,$tmp, 0x0\n\t"
            "xorl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x1\n\t"
            "xorl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "xorl    $dst,$tmp3\n\t"
            "pextrb  $tmp3,$tmp, 0x3\n\t"
            "xorl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! xor reduction32B" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzbl($dst$$Register, $src1$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvxor64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpxor   $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "movzbl  $dst,$src1\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "xorl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "xorl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "xorl    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x8\n\t"
            "xorl    $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! xor reduction64B" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzbl($dst$$Register, $src1$$Register);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 8);
    __ xorl($dst$$Register, $tmp3$$Register);
    __ movsbl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

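// XOR reductions over short vectors.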
instruct rsxor4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pxor    $tmp,$src2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp2,$tmp, 0x0\n\t"
            "xorw    $dst,$tmp2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "xorw    $dst,$tmp2\n\t"
            "movswl  $dst,$dst\t! xor reduction4S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
    __ xorw($dst$$Register, $tmp2$$Register);
    __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
    __ xorw($dst$$Register, $tmp2$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsxor8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "pshufd  $tmp,$src2,0xE\n\t"
            "pxor    $tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "pxor    $tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "xorw    $dst,$tmp3\n\t"
            "pextrw  $tmp3,$tmp, 0x1\n\t"
            "xorw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! xor reduction8S" %}
  ins_encode %{
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ pxor($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ xorw($dst$$Register, $tmp3$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ xorw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvxor16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpxor   $tmp,$tmp,$src2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "xorw    $dst,$tmp3\n\t"
            "pextrw  $tmp3,$tmp, 0x1\n\t"
            "xorw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! xor reduction16S" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
    __ xorw($dst$$Register, $tmp3$$Register);
    __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
    __ xorw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvxor32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (XorReductionV src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
            "vpxor   $tmp2,$tmp2,$src2\n\t"
            "vextracti128_high  $tmp,$tmp2\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0xE\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "pshufd  $tmp2,$tmp,0x1\n\t"
            "vpxor   $tmp,$tmp,$tmp2\n\t"
            "movzwl  $dst,$src1\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "xorw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
            "xorw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! xor reduction32S" %}
  ins_encode %{
    int vector_len = 0;
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ movzwl($dst$$Register, $src1$$Register);
    __ movdl($tmp3$$Register, $tmp$$XMMRegister);
    __ xorw($dst$$Register, $tmp3$$Register);
    __ shrl($tmp3$$Register, 16);
    __ xorw($dst$$Register, $tmp3$$Register);
    __ movswl($dst$$Register, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

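// XOR reductions over int vectors.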
11650 instruct rsxor2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
11651   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11652   match(Set dst (XorReductionV src1 src2));
11653   effect(TEMP tmp, TEMP tmp2);
11654   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
11655             "pxor    $tmp2,$src2\n\t"
11656             "movd    $tmp,$src1\n\t"
11657             "pxor    $tmp2,$tmp\n\t"
11658             "movd    $dst,$tmp2\t! xor reduction2I" %}
11659   ins_encode %{
11660     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
11661     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11662     __ movdl($tmp$$XMMRegister, $src1$$Register);
11663     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11664     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11665   %}
11666   ins_pipe( pipe_slow );
11667 %}
11668 
11669 instruct rsxor4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
11670   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11671   match(Set dst (XorReductionV src1 src2));
11672   effect(TEMP tmp, TEMP tmp2);
11673   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11674             "pxor    $tmp2,$src2\n\t"
11675             "pshufd  $tmp,$tmp2,0x1\n\t"
11676             "pxor    $tmp2,$tmp\n\t"
11677             "movd    $tmp,$src1\n\t"
11678             "pxor    $tmp2,$tmp\n\t"
11679             "movd    $dst,$tmp2\t! xor reduction4I" %}
11680   ins_encode %{
11681     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11682     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11683     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
11684     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11685     __ movdl($tmp$$XMMRegister, $src1$$Register);
11686     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11687     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11688   %}
11689   ins_pipe( pipe_slow );
11690 %}
11691 
11692 instruct rvxor8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
11693   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11694   match(Set dst (XorReductionV src1 src2));
11695   effect(TEMP tmp, TEMP tmp2);
11696   format %{ "vextracti128_high  $tmp,$src2\n\t"
11697             "vpxor    $tmp,$tmp,$src2\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpxor    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
11701             "vpxor    $tmp,$tmp,$tmp2\n\t"
11702             "movd     $tmp2,$src1\n\t"
11703             "vpxor    $tmp2,$tmp,$tmp2\n\t"
11704             "movd     $dst,$tmp2\t! xor reduction8I" %}
11705   ins_encode %{
11706     int vector_len = 0;
11707     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11708     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11709     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11710     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11711     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11712     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11713     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11714     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11715     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11716   %}
11717   ins_pipe( pipe_slow );
11718 %}
11719 
11720 instruct rvxor16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
11721   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11722   match(Set dst (XorReductionV src1 src2));
11723   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
11724   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
11725             "vpxor  $tmp3,$tmp3,$src2\n\t"
11726             "vextracti128_high  $tmp,$tmp3\n\t"
            "vpxor    $tmp,$tmp,$tmp3\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpxor    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
11731             "vpxor    $tmp,$tmp,$tmp2\n\t"
11732             "movd     $tmp2,$src1\n\t"
11733             "vpxor    $tmp2,$tmp,$tmp2\n\t"
11734             "movd     $dst,$tmp2\t! xor reduction16I" %}
11735   ins_encode %{
11736     int vector_len = 0;
11737     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
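    // xor the two 256-bit halves of the 512-bit source (vector_len 1 = 256-bit op)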
11738     __ vpxor($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
11739     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
11740     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
11741     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11742     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11743     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11744     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11745     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11746     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11747     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11748   %}
11749   ins_pipe( pipe_slow );
11750 %}
11751 
11752 instruct rsxor2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11754   match(Set dst (XorReductionV src1 src2));
11755   effect(TEMP tmp, TEMP tmp2);
11756   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11757             "pxor    $tmp2,$src2\n\t"
11758             "movdq   $tmp,$src1\n\t"
11759             "pxor    $tmp2,$tmp\n\t"
11760             "movq   $dst,$tmp2\t! xor reduction2L" %}
11761   ins_encode %{
11762     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11763     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11764     __ movdq($tmp$$XMMRegister, $src1$$Register);
11765     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11766     __ movq($dst$$Register, $tmp2$$XMMRegister);
11767   %}
11768   ins_pipe( pipe_slow );
11769 %}
11770 
11771 instruct rvxor4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
11772   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11773   match(Set dst (XorReductionV src1 src2));
11774   effect(TEMP tmp, TEMP tmp2);
11775   format %{ "vextracti128_high  $tmp,$src2\n\t"
11776             "vpxor  $tmp2,$tmp,$src2\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
11778             "vpxor  $tmp2,$tmp2,$tmp\n\t"
11779             "movq   $tmp,$src1\n\t"
11780             "vpxor  $tmp2,$tmp2,$tmp\n\t"
11781             "movq   $dst,$tmp2\t! xor reduction4L" %}
11782   ins_encode %{
11783     int vector_len = 0;
11784     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11785     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11786     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11787     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11788     __ movq($tmp$$XMMRegister, $src1$$Register);
11789     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11790     __ movq($dst$$Register, $tmp2$$XMMRegister);
11791   %}
11792   ins_pipe( pipe_slow );
11793 %}
11794 
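// The 8-long form moves 64-bit values between general-purpose and XMM
// registers (movdq), which is only encodable in 64-bit mode.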
11795 #ifdef _LP64
11796 instruct rvxor8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
11797   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11798   match(Set dst (XorReductionV src1 src2));
11799   effect(TEMP tmp, TEMP tmp2);
11800   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11801             "vpxorq  $tmp2,$tmp2,$src2\n\t"
11802             "vextracti128_high  $tmp,$tmp2\n\t"
11803             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
11805             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11806             "movdq   $tmp,$src1\n\t"
11807             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11808             "movdq   $dst,$tmp2\t! xor reduction8L" %}
11809   ins_encode %{
11810     int vector_len = 0;
11811     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
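    // xor the two 256-bit halves of the 512-bit source (vector_len 1 = 256-bit op)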
11812     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11813     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11814     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11815     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11816     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11817     __ movdq($tmp$$XMMRegister, $src1$$Register);
11818     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11819     __ movdq($dst$$Register, $tmp2$$XMMRegister);
11820   %}
11821   ins_pipe( pipe_slow );
11822 %}
11823 #endif
11824 
11825 // ====================VECTOR ARITHMETIC=======================================
11826 
11827 // --------------------------------- ADD --------------------------------------
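// Most operations below come in three forms per vector size: an SSE form
// (UseAVX == 0) that operates on $dst in place, a three-operand AVX
// register form, and an AVX form taking the second operand from memory.
// The vector_len argument passed to the assembler selects the encoded
// width: 0 = 128-bit (XMM), 1 = 256-bit (YMM), 2 = 512-bit (ZMM).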
11828 
11829 // Bytes vector add
11830 instruct vadd4B(vecS dst, vecS src) %{
11831   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
11832   match(Set dst (AddVB dst src));
11833   format %{ "paddb   $dst,$src\t! add packed4B" %}
11834   ins_encode %{
11835     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11836   %}
11837   ins_pipe( pipe_slow );
11838 %}
11839 
11840 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
11841   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
11842   match(Set dst (AddVB src1 src2));
11843   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
11844   ins_encode %{
11845     int vector_len = 0;
11846     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11847   %}
11848   ins_pipe( pipe_slow );
%}

11852 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
11853   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
11854   match(Set dst (AddVB src (LoadVector mem)));
11855   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
11856   ins_encode %{
11857     int vector_len = 0;
11858     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11859   %}
11860   ins_pipe( pipe_slow );
11861 %}
11862 
11863 instruct vadd8B(vecD dst, vecD src) %{
11864   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
11865   match(Set dst (AddVB dst src));
11866   format %{ "paddb   $dst,$src\t! add packed8B" %}
11867   ins_encode %{
11868     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11869   %}
11870   ins_pipe( pipe_slow );
11871 %}
11872 
11873 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
11874   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
11875   match(Set dst (AddVB src1 src2));
11876   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
11877   ins_encode %{
11878     int vector_len = 0;
11879     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11880   %}
11881   ins_pipe( pipe_slow );
%}

11885 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
11886   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
11887   match(Set dst (AddVB src (LoadVector mem)));
11888   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
11889   ins_encode %{
11890     int vector_len = 0;
11891     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11892   %}
11893   ins_pipe( pipe_slow );
11894 %}
11895 
11896 instruct vadd16B(vecX dst, vecX src) %{
11897   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
11898   match(Set dst (AddVB dst src));
11899   format %{ "paddb   $dst,$src\t! add packed16B" %}
11900   ins_encode %{
11901     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11902   %}
11903   ins_pipe( pipe_slow );
11904 %}
11905 
11906 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
11908   match(Set dst (AddVB src1 src2));
11909   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
11910   ins_encode %{
11911     int vector_len = 0;
11912     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11913   %}
11914   ins_pipe( pipe_slow );
11915 %}
11916 
11917 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
11918   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
11919   match(Set dst (AddVB src (LoadVector mem)));
11920   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
11921   ins_encode %{
11922     int vector_len = 0;
11923     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11924   %}
11925   ins_pipe( pipe_slow );
11926 %}
11927 
11928 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
11929   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
11930   match(Set dst (AddVB src1 src2));
11931   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
11932   ins_encode %{
11933     int vector_len = 1;
11934     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11935   %}
11936   ins_pipe( pipe_slow );
11937 %}
11938 
11939 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
11940   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
11941   match(Set dst (AddVB src (LoadVector mem)));
11942   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
11943   ins_encode %{
11944     int vector_len = 1;
11945     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11946   %}
11947   ins_pipe( pipe_slow );
11948 %}
11949 
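// vpaddb on 512-bit ZMM registers is only encodable with AVX-512BW,
// hence the additional supports_avx512bw() check on the 64-byte forms.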
11950 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
11951   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
11952   match(Set dst (AddVB src1 src2));
11953   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
11954   ins_encode %{
11955     int vector_len = 2;
11956     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11957   %}
11958   ins_pipe( pipe_slow );
11959 %}
11960 
11961 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
11962   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
11963   match(Set dst (AddVB src (LoadVector mem)));
11964   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
11965   ins_encode %{
11966     int vector_len = 2;
11967     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11968   %}
11969   ins_pipe( pipe_slow );
11970 %}
11971 
11972 // Shorts/Chars vector add
11973 instruct vadd2S(vecS dst, vecS src) %{
11974   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
11975   match(Set dst (AddVS dst src));
11976   format %{ "paddw   $dst,$src\t! add packed2S" %}
11977   ins_encode %{
11978     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
11979   %}
11980   ins_pipe( pipe_slow );
11981 %}
11982 
11983 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
11985   match(Set dst (AddVS src1 src2));
11986   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
11987   ins_encode %{
11988     int vector_len = 0;
11989     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11990   %}
11991   ins_pipe( pipe_slow );
11992 %}
11993 
11994 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
11995   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
11996   match(Set dst (AddVS src (LoadVector mem)));
11997   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
11998   ins_encode %{
11999     int vector_len = 0;
12000     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12001   %}
12002   ins_pipe( pipe_slow );
12003 %}
12004 
12005 instruct vadd4S(vecD dst, vecD src) %{
12006   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12007   match(Set dst (AddVS dst src));
12008   format %{ "paddw   $dst,$src\t! add packed4S" %}
12009   ins_encode %{
12010     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
12011   %}
12012   ins_pipe( pipe_slow );
12013 %}
12014 
12015 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
12016   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12017   match(Set dst (AddVS src1 src2));
12018   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
12019   ins_encode %{
12020     int vector_len = 0;
12021     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12022   %}
12023   ins_pipe( pipe_slow );
12024 %}
12025 
12026 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
12027   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12028   match(Set dst (AddVS src (LoadVector mem)));
12029   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
12030   ins_encode %{
12031     int vector_len = 0;
12032     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12033   %}
12034   ins_pipe( pipe_slow );
12035 %}
12036 
12037 instruct vadd8S(vecX dst, vecX src) %{
12038   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12039   match(Set dst (AddVS dst src));
12040   format %{ "paddw   $dst,$src\t! add packed8S" %}
12041   ins_encode %{
12042     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
12043   %}
12044   ins_pipe( pipe_slow );
12045 %}
12046 
12047 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
12048   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12049   match(Set dst (AddVS src1 src2));
12050   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
12051   ins_encode %{
12052     int vector_len = 0;
12053     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12054   %}
12055   ins_pipe( pipe_slow );
12056 %}
12057 
12058 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
12059   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12060   match(Set dst (AddVS src (LoadVector mem)));
12061   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
12062   ins_encode %{
12063     int vector_len = 0;
12064     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12065   %}
12066   ins_pipe( pipe_slow );
12067 %}
12068 
12069 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
12070   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12071   match(Set dst (AddVS src1 src2));
12072   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
12073   ins_encode %{
12074     int vector_len = 1;
12075     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12076   %}
12077   ins_pipe( pipe_slow );
12078 %}
12079 
12080 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
12081   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12082   match(Set dst (AddVS src (LoadVector mem)));
12083   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
12084   ins_encode %{
12085     int vector_len = 1;
12086     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12087   %}
12088   ins_pipe( pipe_slow );
12089 %}
12090 
12091 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
12092   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12093   match(Set dst (AddVS src1 src2));
12094   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
12095   ins_encode %{
12096     int vector_len = 2;
12097     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12098   %}
12099   ins_pipe( pipe_slow );
12100 %}
12101 
12102 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
12103   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12104   match(Set dst (AddVS src (LoadVector mem)));
12105   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
12106   ins_encode %{
12107     int vector_len = 2;
12108     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12109   %}
12110   ins_pipe( pipe_slow );
12111 %}
12112 
12113 // Integers vector add
12114 instruct vadd2I(vecD dst, vecD src) %{
12115   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12116   match(Set dst (AddVI dst src));
12117   format %{ "paddd   $dst,$src\t! add packed2I" %}
12118   ins_encode %{
12119     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
12120   %}
12121   ins_pipe( pipe_slow );
12122 %}
12123 
12124 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
12125   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12126   match(Set dst (AddVI src1 src2));
12127   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
12128   ins_encode %{
12129     int vector_len = 0;
12130     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12131   %}
12132   ins_pipe( pipe_slow );
12133 %}
12134 
12135 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
12136   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12137   match(Set dst (AddVI src (LoadVector mem)));
12138   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
12139   ins_encode %{
12140     int vector_len = 0;
12141     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12142   %}
12143   ins_pipe( pipe_slow );
12144 %}
12145 
12146 instruct vadd4I(vecX dst, vecX src) %{
12147   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12148   match(Set dst (AddVI dst src));
12149   format %{ "paddd   $dst,$src\t! add packed4I" %}
12150   ins_encode %{
12151     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
12152   %}
12153   ins_pipe( pipe_slow );
12154 %}
12155 
12156 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
12157   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12158   match(Set dst (AddVI src1 src2));
12159   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
12160   ins_encode %{
12161     int vector_len = 0;
12162     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12163   %}
12164   ins_pipe( pipe_slow );
12165 %}
12166 
12167 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
12168   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12169   match(Set dst (AddVI src (LoadVector mem)));
12170   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
12171   ins_encode %{
12172     int vector_len = 0;
12173     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12174   %}
12175   ins_pipe( pipe_slow );
12176 %}
12177 
12178 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
12179   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12180   match(Set dst (AddVI src1 src2));
12181   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
12182   ins_encode %{
12183     int vector_len = 1;
12184     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12185   %}
12186   ins_pipe( pipe_slow );
12187 %}
12188 
12189 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
12190   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12191   match(Set dst (AddVI src (LoadVector mem)));
12192   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
12193   ins_encode %{
12194     int vector_len = 1;
12195     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12196   %}
12197   ins_pipe( pipe_slow );
12198 %}
12199 
12200 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
12201   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12202   match(Set dst (AddVI src1 src2));
12203   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
12204   ins_encode %{
12205     int vector_len = 2;
12206     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12207   %}
12208   ins_pipe( pipe_slow );
12209 %}
12210 
12211 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
12212   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12213   match(Set dst (AddVI src (LoadVector mem)));
12214   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
12215   ins_encode %{
12216     int vector_len = 2;
12217     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12218   %}
12219   ins_pipe( pipe_slow );
12220 %}
12221 
12222 // Longs vector add
12223 instruct vadd2L(vecX dst, vecX src) %{
12224   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12225   match(Set dst (AddVL dst src));
12226   format %{ "paddq   $dst,$src\t! add packed2L" %}
12227   ins_encode %{
12228     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
12229   %}
12230   ins_pipe( pipe_slow );
12231 %}
12232 
12233 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
12234   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12235   match(Set dst (AddVL src1 src2));
12236   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
12237   ins_encode %{
12238     int vector_len = 0;
12239     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12240   %}
12241   ins_pipe( pipe_slow );
12242 %}
12243 
12244 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
12245   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12246   match(Set dst (AddVL src (LoadVector mem)));
12247   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
12248   ins_encode %{
12249     int vector_len = 0;
12250     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12251   %}
12252   ins_pipe( pipe_slow );
12253 %}
12254 
12255 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
12256   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12257   match(Set dst (AddVL src1 src2));
12258   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
12259   ins_encode %{
12260     int vector_len = 1;
12261     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12262   %}
12263   ins_pipe( pipe_slow );
12264 %}
12265 
12266 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
12267   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12268   match(Set dst (AddVL src (LoadVector mem)));
12269   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
12270   ins_encode %{
12271     int vector_len = 1;
12272     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12273   %}
12274   ins_pipe( pipe_slow );
12275 %}
12276 
12277 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
12278   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12279   match(Set dst (AddVL src1 src2));
12280   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
12281   ins_encode %{
12282     int vector_len = 2;
12283     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12284   %}
12285   ins_pipe( pipe_slow );
12286 %}
12287 
12288 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
12289   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12290   match(Set dst (AddVL src (LoadVector mem)));
12291   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
12292   ins_encode %{
12293     int vector_len = 2;
12294     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12295   %}
12296   ins_pipe( pipe_slow );
12297 %}
12298 
12299 // Floats vector add
12300 instruct vadd2F(vecD dst, vecD src) %{
12301   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12302   match(Set dst (AddVF dst src));
12303   format %{ "addps   $dst,$src\t! add packed2F" %}
12304   ins_encode %{
12305     __ addps($dst$$XMMRegister, $src$$XMMRegister);
12306   %}
12307   ins_pipe( pipe_slow );
12308 %}
12309 
12310 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
12311   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12312   match(Set dst (AddVF src1 src2));
12313   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
12314   ins_encode %{
12315     int vector_len = 0;
12316     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12317   %}
12318   ins_pipe( pipe_slow );
12319 %}
12320 
12321 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
12322   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12323   match(Set dst (AddVF src (LoadVector mem)));
12324   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
12325   ins_encode %{
12326     int vector_len = 0;
12327     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12328   %}
12329   ins_pipe( pipe_slow );
12330 %}
12331 
12332 instruct vadd4F(vecX dst, vecX src) %{
12333   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12334   match(Set dst (AddVF dst src));
12335   format %{ "addps   $dst,$src\t! add packed4F" %}
12336   ins_encode %{
12337     __ addps($dst$$XMMRegister, $src$$XMMRegister);
12338   %}
12339   ins_pipe( pipe_slow );
12340 %}
12341 
12342 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
12343   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12344   match(Set dst (AddVF src1 src2));
12345   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
12346   ins_encode %{
12347     int vector_len = 0;
12348     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12349   %}
12350   ins_pipe( pipe_slow );
12351 %}
12352 
12353 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
12354   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12355   match(Set dst (AddVF src (LoadVector mem)));
12356   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
12357   ins_encode %{
12358     int vector_len = 0;
12359     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12360   %}
12361   ins_pipe( pipe_slow );
12362 %}
12363 
12364 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
12365   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12366   match(Set dst (AddVF src1 src2));
12367   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
12368   ins_encode %{
12369     int vector_len = 1;
12370     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12371   %}
12372   ins_pipe( pipe_slow );
12373 %}
12374 
12375 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
12376   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12377   match(Set dst (AddVF src (LoadVector mem)));
12378   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
12379   ins_encode %{
12380     int vector_len = 1;
12381     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12382   %}
12383   ins_pipe( pipe_slow );
12384 %}
12385 
12386 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
12387   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12388   match(Set dst (AddVF src1 src2));
12389   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
12390   ins_encode %{
12391     int vector_len = 2;
12392     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12393   %}
12394   ins_pipe( pipe_slow );
12395 %}
12396 
12397 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
12398   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12399   match(Set dst (AddVF src (LoadVector mem)));
12400   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
12401   ins_encode %{
12402     int vector_len = 2;
12403     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12404   %}
12405   ins_pipe( pipe_slow );
12406 %}
12407 
12408 // Doubles vector add
12409 instruct vadd2D(vecX dst, vecX src) %{
12410   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12411   match(Set dst (AddVD dst src));
12412   format %{ "addpd   $dst,$src\t! add packed2D" %}
12413   ins_encode %{
12414     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
12415   %}
12416   ins_pipe( pipe_slow );
12417 %}
12418 
12419 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
12420   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12421   match(Set dst (AddVD src1 src2));
12422   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
12423   ins_encode %{
12424     int vector_len = 0;
12425     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12426   %}
12427   ins_pipe( pipe_slow );
12428 %}
12429 
12430 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
12431   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12432   match(Set dst (AddVD src (LoadVector mem)));
12433   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
12434   ins_encode %{
12435     int vector_len = 0;
12436     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12437   %}
12438   ins_pipe( pipe_slow );
12439 %}
12440 
12441 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
12442   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12443   match(Set dst (AddVD src1 src2));
12444   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
12445   ins_encode %{
12446     int vector_len = 1;
12447     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12448   %}
12449   ins_pipe( pipe_slow );
12450 %}
12451 
12452 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
12453   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12454   match(Set dst (AddVD src (LoadVector mem)));
12455   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
12456   ins_encode %{
12457     int vector_len = 1;
12458     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12459   %}
12460   ins_pipe( pipe_slow );
12461 %}
12462 
12463 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
12464   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12465   match(Set dst (AddVD src1 src2));
12466   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
12467   ins_encode %{
12468     int vector_len = 2;
12469     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12470   %}
12471   ins_pipe( pipe_slow );
12472 %}
12473 
12474 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
12475   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12476   match(Set dst (AddVD src (LoadVector mem)));
12477   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
12478   ins_encode %{
12479     int vector_len = 2;
12480     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12481   %}
12482   ins_pipe( pipe_slow );
12483 %}
12484 
12485 // --------------------------------- SUB --------------------------------------
12486 
12487 // Bytes vector sub
12488 instruct vsub4B(vecS dst, vecS src) %{
12489   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12490   match(Set dst (SubVB dst src));
12491   format %{ "psubb   $dst,$src\t! sub packed4B" %}
12492   ins_encode %{
12493     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12494   %}
12495   ins_pipe( pipe_slow );
12496 %}
12497 
12498 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
12499   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12500   match(Set dst (SubVB src1 src2));
12501   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
12502   ins_encode %{
12503     int vector_len = 0;
12504     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12505   %}
12506   ins_pipe( pipe_slow );
12507 %}
12508 
12509 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
12510   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12511   match(Set dst (SubVB src (LoadVector mem)));
12512   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
12513   ins_encode %{
12514     int vector_len = 0;
12515     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12516   %}
12517   ins_pipe( pipe_slow );
12518 %}
12519 
12520 instruct vsub8B(vecD dst, vecD src) %{
12521   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12522   match(Set dst (SubVB dst src));
12523   format %{ "psubb   $dst,$src\t! sub packed8B" %}
12524   ins_encode %{
12525     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12526   %}
12527   ins_pipe( pipe_slow );
12528 %}
12529 
12530 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
12531   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12532   match(Set dst (SubVB src1 src2));
12533   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
12534   ins_encode %{
12535     int vector_len = 0;
12536     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12537   %}
12538   ins_pipe( pipe_slow );
12539 %}
12540 
12541 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
12542   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12543   match(Set dst (SubVB src (LoadVector mem)));
12544   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
12545   ins_encode %{
12546     int vector_len = 0;
12547     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12548   %}
12549   ins_pipe( pipe_slow );
12550 %}
12551 
12552 instruct vsub16B(vecX dst, vecX src) %{
12553   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
12554   match(Set dst (SubVB dst src));
12555   format %{ "psubb   $dst,$src\t! sub packed16B" %}
12556   ins_encode %{
12557     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12558   %}
12559   ins_pipe( pipe_slow );
12560 %}
12561 
12562 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
12563   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
12564   match(Set dst (SubVB src1 src2));
12565   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
12566   ins_encode %{
12567     int vector_len = 0;
12568     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12569   %}
12570   ins_pipe( pipe_slow );
12571 %}
12572 
12573 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
12574   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
12575   match(Set dst (SubVB src (LoadVector mem)));
12576   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
12577   ins_encode %{
12578     int vector_len = 0;
12579     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12580   %}
12581   ins_pipe( pipe_slow );
12582 %}
12583 
12584 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
12585   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
12586   match(Set dst (SubVB src1 src2));
12587   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
12588   ins_encode %{
12589     int vector_len = 1;
12590     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12591   %}
12592   ins_pipe( pipe_slow );
12593 %}
12594 
12595 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
12596   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
12597   match(Set dst (SubVB src (LoadVector mem)));
12598   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
12599   ins_encode %{
12600     int vector_len = 1;
12601     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12602   %}
12603   ins_pipe( pipe_slow );
12604 %}
12605 
12606 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
12607   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
12608   match(Set dst (SubVB src1 src2));
12609   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
12610   ins_encode %{
12611     int vector_len = 2;
12612     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12613   %}
12614   ins_pipe( pipe_slow );
12615 %}
12616 
12617 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
12618   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
12619   match(Set dst (SubVB src (LoadVector mem)));
12620   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
12621   ins_encode %{
12622     int vector_len = 2;
12623     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12624   %}
12625   ins_pipe( pipe_slow );
12626 %}
12627 
12628 // Shorts/Chars vector sub
12629 instruct vsub2S(vecS dst, vecS src) %{
12630   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12631   match(Set dst (SubVS dst src));
12632   format %{ "psubw   $dst,$src\t! sub packed2S" %}
12633   ins_encode %{
12634     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12635   %}
12636   ins_pipe( pipe_slow );
12637 %}
12638 
12639 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
12640   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12641   match(Set dst (SubVS src1 src2));
12642   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
12643   ins_encode %{
12644     int vector_len = 0;
12645     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12646   %}
12647   ins_pipe( pipe_slow );
12648 %}
12649 
12650 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
12651   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12652   match(Set dst (SubVS src (LoadVector mem)));
12653   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
12654   ins_encode %{
12655     int vector_len = 0;
12656     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12657   %}
12658   ins_pipe( pipe_slow );
12659 %}
12660 
12661 instruct vsub4S(vecD dst, vecD src) %{
12662   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12663   match(Set dst (SubVS dst src));
12664   format %{ "psubw   $dst,$src\t! sub packed4S" %}
12665   ins_encode %{
12666     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12667   %}
12668   ins_pipe( pipe_slow );
12669 %}
12670 
12671 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
12672   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12673   match(Set dst (SubVS src1 src2));
12674   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
12675   ins_encode %{
12676     int vector_len = 0;
12677     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12678   %}
12679   ins_pipe( pipe_slow );
12680 %}
12681 
12682 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
12683   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12684   match(Set dst (SubVS src (LoadVector mem)));
12685   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
12686   ins_encode %{
12687     int vector_len = 0;
12688     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12689   %}
12690   ins_pipe( pipe_slow );
12691 %}
12692 
12693 instruct vsub8S(vecX dst, vecX src) %{
12694   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12695   match(Set dst (SubVS dst src));
12696   format %{ "psubw   $dst,$src\t! sub packed8S" %}
12697   ins_encode %{
12698     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12699   %}
12700   ins_pipe( pipe_slow );
12701 %}
12702 
12703 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
12704   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12705   match(Set dst (SubVS src1 src2));
12706   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
12707   ins_encode %{
12708     int vector_len = 0;
12709     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12710   %}
12711   ins_pipe( pipe_slow );
12712 %}
12713 
12714 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
12715   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12716   match(Set dst (SubVS src (LoadVector mem)));
12717   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
12718   ins_encode %{
12719     int vector_len = 0;
12720     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12721   %}
12722   ins_pipe( pipe_slow );
12723 %}
12724 
12725 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
12726   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12727   match(Set dst (SubVS src1 src2));
12728   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
12729   ins_encode %{
12730     int vector_len = 1;
12731     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12732   %}
12733   ins_pipe( pipe_slow );
12734 %}
12735 
12736 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
12737   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12738   match(Set dst (SubVS src (LoadVector mem)));
12739   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
12740   ins_encode %{
12741     int vector_len = 1;
12742     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12743   %}
12744   ins_pipe( pipe_slow );
12745 %}
12746 
12747 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
12748   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12749   match(Set dst (SubVS src1 src2));
12750   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
12751   ins_encode %{
12752     int vector_len = 2;
12753     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12754   %}
12755   ins_pipe( pipe_slow );
12756 %}
12757 
12758 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
12759   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12760   match(Set dst (SubVS src (LoadVector mem)));
12761   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
12762   ins_encode %{
12763     int vector_len = 2;
12764     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12765   %}
12766   ins_pipe( pipe_slow );
12767 %}
12768 
12769 // Integers vector sub
12770 instruct vsub2I(vecD dst, vecD src) %{
12771   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12772   match(Set dst (SubVI dst src));
12773   format %{ "psubd   $dst,$src\t! sub packed2I" %}
12774   ins_encode %{
12775     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
12776   %}
12777   ins_pipe( pipe_slow );
12778 %}
12779 
12780 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
12781   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12782   match(Set dst (SubVI src1 src2));
12783   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
12784   ins_encode %{
12785     int vector_len = 0;
12786     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12787   %}
12788   ins_pipe( pipe_slow );
12789 %}
12790 
12791 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
12792   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12793   match(Set dst (SubVI src (LoadVector mem)));
12794   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
12795   ins_encode %{
12796     int vector_len = 0;
12797     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12798   %}
12799   ins_pipe( pipe_slow );
12800 %}
12801 
12802 instruct vsub4I(vecX dst, vecX src) %{
12803   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12804   match(Set dst (SubVI dst src));
12805   format %{ "psubd   $dst,$src\t! sub packed4I" %}
12806   ins_encode %{
12807     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
12808   %}
12809   ins_pipe( pipe_slow );
12810 %}
12811 
12812 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
12813   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12814   match(Set dst (SubVI src1 src2));
12815   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
12816   ins_encode %{
12817     int vector_len = 0;
12818     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12819   %}
12820   ins_pipe( pipe_slow );
12821 %}
12822 
12823 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
12824   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12825   match(Set dst (SubVI src (LoadVector mem)));
12826   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
12827   ins_encode %{
12828     int vector_len = 0;
12829     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12830   %}
12831   ins_pipe( pipe_slow );
12832 %}
12833 
12834 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
12835   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12836   match(Set dst (SubVI src1 src2));
12837   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
12838   ins_encode %{
12839     int vector_len = 1;
12840     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12841   %}
12842   ins_pipe( pipe_slow );
12843 %}
12844 
12845 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
12846   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12847   match(Set dst (SubVI src (LoadVector mem)));
12848   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
12849   ins_encode %{
12850     int vector_len = 1;
12851     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12852   %}
12853   ins_pipe( pipe_slow );
12854 %}
12855 
12856 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
12857   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12858   match(Set dst (SubVI src1 src2));
12859   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
12860   ins_encode %{
12861     int vector_len = 2;
12862     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12863   %}
12864   ins_pipe( pipe_slow );
12865 %}
12866 
12867 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
12868   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12869   match(Set dst (SubVI src (LoadVector mem)));
12870   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
12871   ins_encode %{
12872     int vector_len = 2;
12873     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12874   %}
12875   ins_pipe( pipe_slow );
12876 %}
12877 
12878 // Longs vector sub
12879 instruct vsub2L(vecX dst, vecX src) %{
12880   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12881   match(Set dst (SubVL dst src));
12882   format %{ "psubq   $dst,$src\t! sub packed2L" %}
12883   ins_encode %{
12884     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
12885   %}
12886   ins_pipe( pipe_slow );
12887 %}
12888 
12889 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
12890   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12891   match(Set dst (SubVL src1 src2));
12892   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
12893   ins_encode %{
12894     int vector_len = 0;
12895     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12896   %}
12897   ins_pipe( pipe_slow );
12898 %}
12899 
12900 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
12901   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12902   match(Set dst (SubVL src (LoadVector mem)));
12903   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
12904   ins_encode %{
12905     int vector_len = 0;
12906     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12907   %}
12908   ins_pipe( pipe_slow );
12909 %}
12910 
12911 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
12912   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12913   match(Set dst (SubVL src1 src2));
12914   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
12915   ins_encode %{
12916     int vector_len = 1;
12917     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12918   %}
12919   ins_pipe( pipe_slow );
12920 %}
12921 
12922 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
12923   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12924   match(Set dst (SubVL src (LoadVector mem)));
12925   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
12926   ins_encode %{
12927     int vector_len = 1;
12928     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12929   %}
12930   ins_pipe( pipe_slow );
12931 %}
12932 
12933 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
12934   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12935   match(Set dst (SubVL src1 src2));
12936   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
12937   ins_encode %{
12938     int vector_len = 2;
12939     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12940   %}
12941   ins_pipe( pipe_slow );
12942 %}
12943 
12944 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
12945   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12946   match(Set dst (SubVL src (LoadVector mem)));
12947   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
12948   ins_encode %{
12949     int vector_len = 2;
12950     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12951   %}
12952   ins_pipe( pipe_slow );
12953 %}
12954 
12955 // Floats vector sub
12956 instruct vsub2F(vecD dst, vecD src) %{
12957   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12958   match(Set dst (SubVF dst src));
12959   format %{ "subps   $dst,$src\t! sub packed2F" %}
12960   ins_encode %{
12961     __ subps($dst$$XMMRegister, $src$$XMMRegister);
12962   %}
12963   ins_pipe( pipe_slow );
12964 %}
12965 
12966 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
12967   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12968   match(Set dst (SubVF src1 src2));
12969   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
12970   ins_encode %{
12971     int vector_len = 0;
12972     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12973   %}
12974   ins_pipe( pipe_slow );
12975 %}
12976 
12977 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
12978   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12979   match(Set dst (SubVF src (LoadVector mem)));
12980   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
12981   ins_encode %{
12982     int vector_len = 0;
12983     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12984   %}
12985   ins_pipe( pipe_slow );
12986 %}
12987 
12988 instruct vsub4F(vecX dst, vecX src) %{
12989   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12990   match(Set dst (SubVF dst src));
12991   format %{ "subps   $dst,$src\t! sub packed4F" %}
12992   ins_encode %{
12993     __ subps($dst$$XMMRegister, $src$$XMMRegister);
12994   %}
12995   ins_pipe( pipe_slow );
12996 %}
12997 
12998 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
12999   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13000   match(Set dst (SubVF src1 src2));
13001   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
13002   ins_encode %{
13003     int vector_len = 0;
13004     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13005   %}
13006   ins_pipe( pipe_slow );
13007 %}
13008 
13009 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
13010   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13011   match(Set dst (SubVF src (LoadVector mem)));
13012   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
13013   ins_encode %{
13014     int vector_len = 0;
13015     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13016   %}
13017   ins_pipe( pipe_slow );
13018 %}
13019 
13020 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
13021   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13022   match(Set dst (SubVF src1 src2));
13023   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
13024   ins_encode %{
13025     int vector_len = 1;
13026     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13027   %}
13028   ins_pipe( pipe_slow );
13029 %}
13030 
13031 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
13032   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13033   match(Set dst (SubVF src (LoadVector mem)));
13034   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
13035   ins_encode %{
13036     int vector_len = 1;
13037     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13038   %}
13039   ins_pipe( pipe_slow );
13040 %}
13041 
13042 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
13043   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13044   match(Set dst (SubVF src1 src2));
13045   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
13046   ins_encode %{
13047     int vector_len = 2;
13048     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13049   %}
13050   ins_pipe( pipe_slow );
13051 %}
13052 
13053 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
13054   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13055   match(Set dst (SubVF src (LoadVector mem)));
13056   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
13057   ins_encode %{
13058     int vector_len = 2;
13059     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13060   %}
13061   ins_pipe( pipe_slow );
13062 %}
13063 
13064 // Doubles vector sub
13065 instruct vsub2D(vecX dst, vecX src) %{
13066   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13067   match(Set dst (SubVD dst src));
13068   format %{ "subpd   $dst,$src\t! sub packed2D" %}
13069   ins_encode %{
13070     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
13071   %}
13072   ins_pipe( pipe_slow );
13073 %}
13074 
13075 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
13076   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13077   match(Set dst (SubVD src1 src2));
13078   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
13079   ins_encode %{
13080     int vector_len = 0;
13081     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13082   %}
13083   ins_pipe( pipe_slow );
13084 %}
13085 
13086 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
13087   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13088   match(Set dst (SubVD src (LoadVector mem)));
13089   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
13090   ins_encode %{
13091     int vector_len = 0;
13092     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13093   %}
13094   ins_pipe( pipe_slow );
13095 %}
13096 
13097 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
13098   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13099   match(Set dst (SubVD src1 src2));
13100   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
13101   ins_encode %{
13102     int vector_len = 1;
13103     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13104   %}
13105   ins_pipe( pipe_slow );
13106 %}
13107 
13108 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
13109   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13110   match(Set dst (SubVD src (LoadVector mem)));
13111   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
13112   ins_encode %{
13113     int vector_len = 1;
13114     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13115   %}
13116   ins_pipe( pipe_slow );
13117 %}
13118 
13119 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
13120   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13121   match(Set dst (SubVD src1 src2));
13122   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
13123   ins_encode %{
13124     int vector_len = 2;
13125     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13126   %}
13127   ins_pipe( pipe_slow );
13128 %}
13129 
13130 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
13131   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13132   match(Set dst (SubVD src (LoadVector mem)));
13133   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
13134   ins_encode %{
13135     int vector_len = 2;
13136     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13137   %}
13138   ins_pipe( pipe_slow );
13139 %}
13140 
13141 // --------------------------------- MUL --------------------------------------
13142 
13143 // Byte vector mul
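// There is no packed byte multiply in SSE/AVX, so the rules below widen the
// bytes to words (pmovsxbw), multiply with pmullw, mask each word down to its
// low byte, and narrow the result back with packuswb.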
13144 
13145 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp2, vecS tmp) %{
13146   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
13147   match(Set dst (MulVB src1 src2));
13148   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13149   format %{"pmovsxbw  $tmp,$src1\n\t"
13150            "pmovsxbw  $tmp2,$src2\n\t"
13151            "pmullw    $tmp,$tmp2\n\t"
13152            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13153            "pand      $tmp,$tmp2\n\t"
13154            "packuswb  $tmp,$tmp\n\t"
13155            "movss     $dst,$tmp\t! mul packed4B" %}
13156   ins_encode %{
13157     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13158     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13159     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13160     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13161     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
13162     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
13163     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
13164   %}
13165   ins_pipe( pipe_slow );
13166 %}
13167 
13168 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp2, vecD tmp) %{
13169   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
13170   match(Set dst (MulVB src1 src2));
13171   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13172   format %{"pmovsxbw  $tmp,$src1\n\t"
13173            "pmovsxbw  $tmp2,$src2\n\t"
13174            "pmullw    $tmp,$tmp2\n\t"
13175            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13176            "pand      $tmp,$tmp2\n\t"
13177            "packuswb  $tmp,$tmp\n\t"
13178            "movsd     $dst,$tmp\t! mul packed8B" %}
13179   ins_encode %{
13180     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13181     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13182     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13183     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13184     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
13185     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
13186     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
13187   %}
13188   ins_pipe( pipe_slow );
13189 %}
13190 
13191 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp3, vecX tmp2, vecX tmp) %{
13192   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
13193   match(Set dst (MulVB src1 src2));
13194   effect(TEMP tmp3, TEMP tmp2, TEMP tmp);
13195   format %{"pmovsxbw  $tmp,$src1\n\t"
13196            "pmovsxbw  $tmp2,$src2\n\t"
13197            "pmullw    $tmp,$tmp2\n\t"
13198            "pshufd    $tmp2,$src1\n\t"
13199            "pshufd    $tmp3,$src2\n\t"
13200            "pmovsxbw  $tmp2,$tmp2\n\t"
13201            "pmovsxbw  $tmp3,$tmp3\n\t"
13202            "pmullw    $tmp2,$tmp3\n\t"
13203            "movdqu    $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
13204            "pand      $tmp,$tmp3\n\t"
13205            "pand      $tmp2,$tmp3\n\t"
13206            "packuswb  $tmp,$tmp2\n\t"
13207            "movdqu    $dst,$tmp \n\t! mul packed16B" %}
13208   ins_encode %{
13209     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13210     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13211     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
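    // pshufd with 238 (0xEE) copies the high 8 bytes of each source into the
    // low half so the upper elements can be widened and multiplied as well.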
13212     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 238);
13213     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 238);
13214     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
13215     __ pmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister);
13216     __ pmullw($tmp2$$XMMRegister, $tmp3$$XMMRegister);
13217     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13218     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
13219     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
13220     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
13221     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
13222   %}
13223   ins_pipe( pipe_slow );
13224 %}
13225 
13226 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecY tmp2, vecY tmp) %{
13227   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13228   match(Set dst (MulVB src1 src2));
13229   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13230   format %{"vpmovsxbw  $tmp,$src1\n\t"
13231            "vpmovsxbw  $tmp2,$src2\n\t"
13232            "vpmullw    $tmp,$tmp2\n\t"
13233            "vmovdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13234            "vpand      $tmp,$tmp2\n\t"
13235            "vextracti128_high  $tmp2,$tmp\n\t"
13236            "vpackuswb  $dst,$tmp, $tmp2\n\t! mul packed16B" %}
13237   ins_encode %{
    int vector_len = 1;
13239     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
13240     __ vpmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
13241     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
13242     __ vmovdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13243     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
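    // Narrow the 16 word products back to bytes by packing the high 128-bit
    // half of the ymm result against the low half.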
13244     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
13245     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
13246   %}
13247   ins_pipe( pipe_slow );
13248 %}
13249 
13250 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, vecY tmp3) %{
13251   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
13252   match(Set dst (MulVB src1 src2));
13253   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
13254   format %{"vextracti128_high  $tmp1,$src1\n\t"
13255            "vextracti128_high  $tmp3,$src2\n\t"
13256            "vpmovsxbw $tmp1,$tmp1\n\t"
13257            "vpmovsxbw $tmp3,$tmp3\n\t"
13258            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
13259            "vpmovsxbw $tmp2,$src1\n\t"
13260            "vpmovsxbw $tmp3,$src2\n\t"
13261            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
13262            "vmovdqu $tmp3, [0x00ff00ff0x00ff00ff]\n\t"
13263            "vpbroadcastd $tmp3, $tmp3\n\t"
13264            "vpand $tmp2,$tmp2,$tmp3\n\t"
13265            "vpand $tmp1,$tmp1,$tmp3\n\t"
13266            "vpackuswb $dst,$tmp2,$tmp1\n\t"
13267            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
13268   ins_encode %{
13269     int vector_len = 1;
13270     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
13271     __ vextracti128_high($tmp3$$XMMRegister, $src2$$XMMRegister);
13272     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13273     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13274     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13275     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
13276     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
13277     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13278     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13279     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13280     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13281     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13282     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13283     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
13284   %}
13285   ins_pipe( pipe_slow );
13286 %}
13287 
instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, vecZ tmp3, vecZ tmp4) %{
13289   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
13290   match(Set dst (MulVB src1 src2));
13291   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4);
13292   format %{"vextracti64x4_high  $tmp1,$src1\n\t"
13293            "vextracti64x4_high  $tmp3,$src2\n\t"
13294            "vpmovsxbw $tmp1,$tmp1\n\t"
13295            "vpmovsxbw $tmp3,$tmp3\n\t"
13296            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
13297            "vpmovsxbw $tmp2,$src1\n\t"
13298            "vpmovsxbw $tmp3,$src2\n\t"
13299            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
13300            "vmovdqu $tmp3, [0x00ff00ff0x00ff00ff]\n\t"
13301            "vpbroadcastd $tmp3, $tmp3\n\t"
13302            "vpand $tmp1,$tmp1,$tmp3\n\t"
13303            "vpand $tmp2,$tmp2,$tmp3\n\t"
13304            "vpackuswb $tmp1,$tmp2,$tmp1\n\t"
13305            "vextracti64x4_high  $tmp3,$tmp1\n\t"
13306            "vpermq $tmp3, $tmp3, 0x8D\n\t"
13307            "vpermq $tmp1, $tmp1, 0xD8\n\t"
13308            "vmovdqu  $tmp4,$tmp3\n\t"
13309            "vmovdqu  $tmp2,$tmp1\n\t"
13310            "vpblendd  $tmp3,$tmp3,$tmp1\n\t"
13311            "vpblendd  $tmp2,$tmp2,$tmp4\n\t"
13312            "vpermq $tmp2,$tmp2,0x4E\n\t"
13313            "vinserti64x4 $dst,$dst,$tmp3,0x00\n\t"
13314            "vinserti64x4 $dst,$dst,$tmp2,0x01\t! mul packed64B" %}
13315   ins_encode %{
13316     int vector_len = 2;
13317     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
13318     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
13319     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13320     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13321     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13322     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
13323     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
13324     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13325     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13326     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13327     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13328     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13329     __ vpackuswb($tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
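    // vpackuswb packs within each 128-bit lane, so the extract/permute/blend
    // sequence below restores the correct byte order across the 512-bit result.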
13330     __ vextracti64x4_high($tmp3$$XMMRegister, $tmp1$$XMMRegister);
13331     __ vpermq($tmp3$$XMMRegister, $tmp3$$XMMRegister, 0x8D, 1);
13332     __ vpermq($tmp1$$XMMRegister, $tmp1$$XMMRegister, 0xD8, 1);
13333     __ vmovdqu($tmp4$$XMMRegister, $tmp3$$XMMRegister);
13334     __ vmovdqu($tmp2$$XMMRegister, $tmp1$$XMMRegister);
13335     __ vpblendd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $tmp1$$XMMRegister, 0x0F, 1);
13336     __ vpblendd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp4$$XMMRegister, 0x0F, 1);
13337     __ vpermq($tmp2$$XMMRegister, $tmp2$$XMMRegister, 0x4E, 1);
13338     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp3$$XMMRegister, 0x00);
13339     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, 0x01);
13340   %}
13341   ins_pipe( pipe_slow );
13342 %}
13343 
13344 // Shorts/Chars vector mul
13345 instruct vmul2S(vecS dst, vecS src) %{
13346   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13347   match(Set dst (MulVS dst src));
13348   format %{ "pmullw $dst,$src\t! mul packed2S" %}
13349   ins_encode %{
13350     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13351   %}
13352   ins_pipe( pipe_slow );
13353 %}
13354 
13355 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
13356   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13357   match(Set dst (MulVS src1 src2));
13358   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
13359   ins_encode %{
13360     int vector_len = 0;
13361     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13362   %}
13363   ins_pipe( pipe_slow );
13364 %}
13365 
13366 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
13367   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13368   match(Set dst (MulVS src (LoadVector mem)));
13369   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
13370   ins_encode %{
13371     int vector_len = 0;
13372     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13373   %}
13374   ins_pipe( pipe_slow );
13375 %}
13376 
13377 instruct vmul4S(vecD dst, vecD src) %{
13378   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13379   match(Set dst (MulVS dst src));
13380   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
13381   ins_encode %{
13382     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13383   %}
13384   ins_pipe( pipe_slow );
13385 %}
13386 
13387 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
13388   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13389   match(Set dst (MulVS src1 src2));
13390   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
13391   ins_encode %{
13392     int vector_len = 0;
13393     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13394   %}
13395   ins_pipe( pipe_slow );
13396 %}
13397 
13398 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
13399   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13400   match(Set dst (MulVS src (LoadVector mem)));
13401   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
13402   ins_encode %{
13403     int vector_len = 0;
13404     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13405   %}
13406   ins_pipe( pipe_slow );
13407 %}
13408 
13409 instruct vmul8S(vecX dst, vecX src) %{
13410   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
13411   match(Set dst (MulVS dst src));
13412   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
13413   ins_encode %{
13414     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13415   %}
13416   ins_pipe( pipe_slow );
13417 %}
13418 
13419 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
13420   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13421   match(Set dst (MulVS src1 src2));
13422   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
13423   ins_encode %{
13424     int vector_len = 0;
13425     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13426   %}
13427   ins_pipe( pipe_slow );
13428 %}
13429 
13430 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
13431   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13432   match(Set dst (MulVS src (LoadVector mem)));
13433   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
13434   ins_encode %{
13435     int vector_len = 0;
13436     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13437   %}
13438   ins_pipe( pipe_slow );
13439 %}
13440 
13441 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
13442   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13443   match(Set dst (MulVS src1 src2));
13444   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
13445   ins_encode %{
13446     int vector_len = 1;
13447     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13448   %}
13449   ins_pipe( pipe_slow );
13450 %}
13451 
13452 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
13453   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13454   match(Set dst (MulVS src (LoadVector mem)));
13455   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
13456   ins_encode %{
13457     int vector_len = 1;
13458     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13459   %}
13460   ins_pipe( pipe_slow );
13461 %}
13462 
13463 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
13464   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
13465   match(Set dst (MulVS src1 src2));
13466   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
13467   ins_encode %{
13468     int vector_len = 2;
13469     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13470   %}
13471   ins_pipe( pipe_slow );
13472 %}
13473 
13474 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
13475   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
13476   match(Set dst (MulVS src (LoadVector mem)));
13477   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
13478   ins_encode %{
13479     int vector_len = 2;
13480     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13481   %}
13482   ins_pipe( pipe_slow );
13483 %}
13484 
13485 // Integers vector mul (sse4_1)
13486 instruct vmul2I(vecD dst, vecD src) %{
13487   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
13488   match(Set dst (MulVI dst src));
13489   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
13490   ins_encode %{
13491     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
13492   %}
13493   ins_pipe( pipe_slow );
13494 %}
13495 
13496 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
13497   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13498   match(Set dst (MulVI src1 src2));
13499   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
13500   ins_encode %{
13501     int vector_len = 0;
13502     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13503   %}
13504   ins_pipe( pipe_slow );
13505 %}
13506 
13507 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
13508   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13509   match(Set dst (MulVI src (LoadVector mem)));
13510   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
13511   ins_encode %{
13512     int vector_len = 0;
13513     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13514   %}
13515   ins_pipe( pipe_slow );
13516 %}
13517 
13518 instruct vmul4I(vecX dst, vecX src) %{
13519   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
13520   match(Set dst (MulVI dst src));
13521   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
13522   ins_encode %{
13523     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
13524   %}
13525   ins_pipe( pipe_slow );
13526 %}
13527 
13528 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
13529   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13530   match(Set dst (MulVI src1 src2));
13531   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
13532   ins_encode %{
13533     int vector_len = 0;
13534     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13535   %}
13536   ins_pipe( pipe_slow );
13537 %}
13538 
13539 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
13540   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13541   match(Set dst (MulVI src (LoadVector mem)));
13542   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
13543   ins_encode %{
13544     int vector_len = 0;
13545     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13546   %}
13547   ins_pipe( pipe_slow );
13548 %}
13549 
13550 // Long vector mul
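// Below AVX-512DQ there is no packed 64x64->64 bit multiply, so it is composed
// from 32-bit halves: lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32).
// pmuludq supplies the low product; pmulld and phaddd form the cross terms.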
13551 
13552 instruct mul2L_reg(vecX dst, vecX src2, vecX tmp) %{
13553   predicate(UseSSE > 3 && n->as_Vector()->length() == 2 && VM_Version::supports_sse4_1());
13554   match(Set dst (MulVL dst src2));
13555   effect(TEMP dst, TEMP tmp);
13556   format %{ "pshufd $tmp,$src2, 177\n\t"
13557             "pmulld $tmp,$dst\n\t"
13558             "phaddd $tmp,$tmp\n\t"
13559             "pmovzxdq $tmp,$tmp\n\t"
13560             "psllq $tmp, 32\n\t"
13561             "pmuludq $dst,$src2\n\t"
13562             "paddq $dst,$tmp\n\t! mul packed2L" %}
13563 
13564   ins_encode %{
13565     int vector_len = 0;
13566     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
13567     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
13568     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
13569     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
13570     __ psllq($tmp$$XMMRegister, 32);
13571     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
13572     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
13573   %}
13574   ins_pipe( pipe_slow );
13575 %}
13576 
13577 instruct vmul2L_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp) %{
13578   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && VM_Version::supports_avx());
13579   match(Set dst (MulVL src1 src2));
13580   effect(TEMP tmp1, TEMP tmp);
13581   format %{ "vpshufd $tmp,$src2\n\t"
13582             "vpmulld $tmp,$src1,$tmp\n\t"
13583             "vphaddd $tmp,$tmp,$tmp\n\t"
13584             "vpmovzxdq $tmp,$tmp\n\t"
13585             "vpsllq $tmp,$tmp\n\t"
13586             "vpmuludq $tmp1,$src1,$src2\n\t"
13587             "vpaddq $dst,$tmp,$tmp1\t! mul packed2L" %}
13588   ins_encode %{
13589     int vector_len = 0;
13590     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
13591     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
13592     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13593     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13594     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
13595     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13596     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13597   %}
13598   ins_pipe( pipe_slow );
13599 %}
13600 
13601 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
13602   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
13603   match(Set dst (MulVL src1 src2));
13604   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
13605   ins_encode %{
13606     int vector_len = 0;
13607     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13608   %}
13609   ins_pipe( pipe_slow );
13610 %}
13611 
13612 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
13613   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
13614   match(Set dst (MulVL src (LoadVector mem)));
13615   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
13616   ins_encode %{
13617     int vector_len = 0;
13618     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13619   %}
13620   ins_pipe( pipe_slow );
13621 %}
13622 
instruct vmul4L_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp, vecY tmp1) %{
13624   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && VM_Version::supports_avx2());
13625   match(Set dst (MulVL src1 src2));
13626   effect(TEMP tmp1, TEMP tmp);
13627   format %{ "vpshufd $tmp,$src2\n\t"
13628             "vpmulld $tmp,$src1,$tmp\n\t"
13629             "vphaddd $tmp,$tmp,$tmp\n\t"
13630             "vpmovzxdq $tmp,$tmp\n\t"
13631             "vpsllq $tmp,$tmp\n\t"
13632             "vpmuludq $tmp1,$src1,$src2\n\t"
13633             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
13634   ins_encode %{
13635     int vector_len = 1;
13636     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
13637     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
13638     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
13639     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13640     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13641     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
13642     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13643     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13644   %}
13645   ins_pipe( pipe_slow );
13646 %}
13647 
13648 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
13649   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
13650   match(Set dst (MulVL src1 src2));
13651   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
13652   ins_encode %{
13653     int vector_len = 1;
13654     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13655   %}
13656   ins_pipe( pipe_slow );
13657 %}
13658 
13659 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
13660   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
13661   match(Set dst (MulVL src (LoadVector mem)));
13662   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
13663   ins_encode %{
13664     int vector_len = 1;
13665     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13666   %}
13667   ins_pipe( pipe_slow );
13668 %}
13669 
13670 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
13671   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
13672   match(Set dst (MulVL src1 src2));
13673   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
13674   ins_encode %{
13675     int vector_len = 2;
13676     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13677   %}
13678   ins_pipe( pipe_slow );
13679 %}
13680 
13681 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
13682   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
13683   match(Set dst (MulVL src (LoadVector mem)));
13684   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
13685   ins_encode %{
13686     int vector_len = 2;
13687     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13688   %}
13689   ins_pipe( pipe_slow );
13690 %}
13691 
13692 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
13693   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
13694   match(Set dst (MulVI src1 src2));
13695   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
13696   ins_encode %{
13697     int vector_len = 1;
13698     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13699   %}
13700   ins_pipe( pipe_slow );
13701 %}
13702 
13703 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
13704   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
13705   match(Set dst (MulVI src (LoadVector mem)));
13706   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
13707   ins_encode %{
13708     int vector_len = 1;
13709     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13710   %}
13711   ins_pipe( pipe_slow );
13712 %}
13713 
13714 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
13715   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13716   match(Set dst (MulVI src1 src2));
13717   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
13718   ins_encode %{
13719     int vector_len = 2;
13720     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13721   %}
13722   ins_pipe( pipe_slow );
13723 %}
13724 
13725 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
13726   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13727   match(Set dst (MulVI src (LoadVector mem)));
13728   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
13729   ins_encode %{
13730     int vector_len = 2;
13731     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13732   %}
13733   ins_pipe( pipe_slow );
13734 %}
13735 
13736 // Floats vector mul
13737 instruct vmul2F(vecD dst, vecD src) %{
13738   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13739   match(Set dst (MulVF dst src));
13740   format %{ "mulps   $dst,$src\t! mul packed2F" %}
13741   ins_encode %{
13742     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
13743   %}
13744   ins_pipe( pipe_slow );
13745 %}
13746 
13747 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
13748   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13749   match(Set dst (MulVF src1 src2));
13750   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
13751   ins_encode %{
13752     int vector_len = 0;
13753     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13754   %}
13755   ins_pipe( pipe_slow );
13756 %}
13757 
13758 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
13759   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13760   match(Set dst (MulVF src (LoadVector mem)));
13761   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
13762   ins_encode %{
13763     int vector_len = 0;
13764     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13765   %}
13766   ins_pipe( pipe_slow );
13767 %}
13768 
13769 instruct vmul4F(vecX dst, vecX src) %{
13770   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13771   match(Set dst (MulVF dst src));
13772   format %{ "mulps   $dst,$src\t! mul packed4F" %}
13773   ins_encode %{
13774     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
13775   %}
13776   ins_pipe( pipe_slow );
13777 %}
13778 
13779 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
13780   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13781   match(Set dst (MulVF src1 src2));
13782   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
13783   ins_encode %{
13784     int vector_len = 0;
13785     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13786   %}
13787   ins_pipe( pipe_slow );
13788 %}
13789 
13790 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
13791   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13792   match(Set dst (MulVF src (LoadVector mem)));
13793   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
13794   ins_encode %{
13795     int vector_len = 0;
13796     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13797   %}
13798   ins_pipe( pipe_slow );
13799 %}
13800 
13801 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
13802   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13803   match(Set dst (MulVF src1 src2));
13804   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
13805   ins_encode %{
13806     int vector_len = 1;
13807     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13808   %}
13809   ins_pipe( pipe_slow );
13810 %}
13811 
13812 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
13813   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13814   match(Set dst (MulVF src (LoadVector mem)));
13815   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
13816   ins_encode %{
13817     int vector_len = 1;
13818     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13819   %}
13820   ins_pipe( pipe_slow );
13821 %}
13822 
13823 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
13824   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13825   match(Set dst (MulVF src1 src2));
13826   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
13827   ins_encode %{
13828     int vector_len = 2;
13829     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13830   %}
13831   ins_pipe( pipe_slow );
13832 %}
13833 
13834 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
13835   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13836   match(Set dst (MulVF src (LoadVector mem)));
13837   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
13838   ins_encode %{
13839     int vector_len = 2;
13840     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13841   %}
13842   ins_pipe( pipe_slow );
13843 %}
13844 
13845 // Doubles vector mul
13846 instruct vmul2D(vecX dst, vecX src) %{
13847   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13848   match(Set dst (MulVD dst src));
13849   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
13850   ins_encode %{
13851     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
13852   %}
13853   ins_pipe( pipe_slow );
13854 %}
13855 
13856 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
13857   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13858   match(Set dst (MulVD src1 src2));
13859   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
13860   ins_encode %{
13861     int vector_len = 0;
13862     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13863   %}
13864   ins_pipe( pipe_slow );
13865 %}
13866 
13867 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
13868   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13869   match(Set dst (MulVD src (LoadVector mem)));
13870   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
13871   ins_encode %{
13872     int vector_len = 0;
13873     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13874   %}
13875   ins_pipe( pipe_slow );
13876 %}
13877 
13878 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
13879   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13880   match(Set dst (MulVD src1 src2));
13881   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
13882   ins_encode %{
13883     int vector_len = 1;
13884     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13885   %}
13886   ins_pipe( pipe_slow );
13887 %}
13888 
13889 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
13890   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13891   match(Set dst (MulVD src (LoadVector mem)));
13892   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
13893   ins_encode %{
13894     int vector_len = 1;
13895     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13896   %}
13897   ins_pipe( pipe_slow );
13898 %}
13899 
13900 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
13901   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13902   match(Set dst (MulVD src1 src2));
13903   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
13904   ins_encode %{
13905     int vector_len = 2;
13906     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13907   %}
13908   ins_pipe( pipe_slow );
13909 %}
13910 
13911 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
13912   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13913   match(Set dst (MulVD src (LoadVector mem)));
13914   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
13915   ins_encode %{
13916     int vector_len = 2;
13917     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13918   %}
13919   ins_pipe( pipe_slow );
13920 %}
13921 
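// Vector conditional move: the packed compare produces an all-ones/all-zeros
// mask per element, which the variable blend then uses to select between
// src1 and src2.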
13922 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
13923   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13924   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
13925   effect(TEMP dst, USE src1, USE src2);
13926   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
13927             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
13928          %}
13929   ins_encode %{
13930     int vector_len = 1;
13931     int cond = (Assembler::Condition)($copnd$$cmpcode);
13932     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
13933     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
13934   %}
13935   ins_pipe( pipe_slow );
13936 %}
13937 
13938 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
13939   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13940   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
13941   effect(TEMP dst, USE src1, USE src2);
13942   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
13943             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
13944          %}
13945   ins_encode %{
13946     int vector_len = 1;
13947     int cond = (Assembler::Condition)($copnd$$cmpcode);
13948     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
13949     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
13950   %}
13951   ins_pipe( pipe_slow );
13952 %}
13953 
13954 // --------------------------------- DIV --------------------------------------
13955 
13956 // Floats vector div
13957 instruct vdiv2F(vecD dst, vecD src) %{
13958   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13959   match(Set dst (DivVF dst src));
13960   format %{ "divps   $dst,$src\t! div packed2F" %}
13961   ins_encode %{
13962     __ divps($dst$$XMMRegister, $src$$XMMRegister);
13963   %}
13964   ins_pipe( pipe_slow );
13965 %}
13966 
13967 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
13968   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13969   match(Set dst (DivVF src1 src2));
13970   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
13971   ins_encode %{
13972     int vector_len = 0;
13973     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13974   %}
13975   ins_pipe( pipe_slow );
13976 %}
13977 
13978 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
13979   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13980   match(Set dst (DivVF src (LoadVector mem)));
13981   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
13982   ins_encode %{
13983     int vector_len = 0;
13984     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13985   %}
13986   ins_pipe( pipe_slow );
13987 %}
13988 
13989 instruct vdiv4F(vecX dst, vecX src) %{
13990   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13991   match(Set dst (DivVF dst src));
13992   format %{ "divps   $dst,$src\t! div packed4F" %}
13993   ins_encode %{
13994     __ divps($dst$$XMMRegister, $src$$XMMRegister);
13995   %}
13996   ins_pipe( pipe_slow );
13997 %}
13998 
13999 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
14000   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14001   match(Set dst (DivVF src1 src2));
14002   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
14003   ins_encode %{
14004     int vector_len = 0;
14005     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14006   %}
14007   ins_pipe( pipe_slow );
14008 %}
14009 
14010 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
14011   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14012   match(Set dst (DivVF src (LoadVector mem)));
14013   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
14014   ins_encode %{
14015     int vector_len = 0;
14016     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14017   %}
14018   ins_pipe( pipe_slow );
14019 %}
14020 
14021 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
14022   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14023   match(Set dst (DivVF src1 src2));
14024   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
14025   ins_encode %{
14026     int vector_len = 1;
14027     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14028   %}
14029   ins_pipe( pipe_slow );
14030 %}
14031 
14032 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
14033   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14034   match(Set dst (DivVF src (LoadVector mem)));
14035   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
14036   ins_encode %{
14037     int vector_len = 1;
14038     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14039   %}
14040   ins_pipe( pipe_slow );
14041 %}
14042 
14043 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14045   match(Set dst (DivVF src1 src2));
14046   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
14047   ins_encode %{
14048     int vector_len = 2;
14049     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14050   %}
14051   ins_pipe( pipe_slow );
14052 %}
14053 
14054 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14056   match(Set dst (DivVF src (LoadVector mem)));
14057   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
14058   ins_encode %{
14059     int vector_len = 2;
14060     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14061   %}
14062   ins_pipe( pipe_slow );
14063 %}
14064 
14065 // Doubles vector div
14066 instruct vdiv2D(vecX dst, vecX src) %{
14067   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
14068   match(Set dst (DivVD dst src));
14069   format %{ "divpd   $dst,$src\t! div packed2D" %}
14070   ins_encode %{
14071     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
14072   %}
14073   ins_pipe( pipe_slow );
14074 %}
14075 
14076 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
14077   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14078   match(Set dst (DivVD src1 src2));
14079   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
14080   ins_encode %{
14081     int vector_len = 0;
14082     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14083   %}
14084   ins_pipe( pipe_slow );
14085 %}
14086 
14087 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
14088   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14089   match(Set dst (DivVD src (LoadVector mem)));
14090   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
14091   ins_encode %{
14092     int vector_len = 0;
14093     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14094   %}
14095   ins_pipe( pipe_slow );
14096 %}
14097 
14098 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
14099   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14100   match(Set dst (DivVD src1 src2));
14101   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
14102   ins_encode %{
14103     int vector_len = 1;
14104     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14105   %}
14106   ins_pipe( pipe_slow );
14107 %}
14108 
14109 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
14110   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14111   match(Set dst (DivVD src (LoadVector mem)));
14112   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
14113   ins_encode %{
14114     int vector_len = 1;
14115     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14116   %}
14117   ins_pipe( pipe_slow );
14118 %}
14119 
14120 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
14121   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14122   match(Set dst (DivVD src1 src2));
14123   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
14124   ins_encode %{
14125     int vector_len = 2;
14126     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14127   %}
14128   ins_pipe( pipe_slow );
14129 %}
14130 
14131 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
14132   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14133   match(Set dst (DivVD src (LoadVector mem)));
14134   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
14135   ins_encode %{
14136     int vector_len = 2;
14137     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14138   %}
14139   ins_pipe( pipe_slow );
14140 %}
14141 
14142 // ------------------------------ Min ---------------------------------------
14143 // Byte vector Min
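// pminsb and pminsd are SSE4.1 instructions while pminsw dates back to SSE2,
// which is why the UseSSE predicates differ between element types.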
14144 instruct min8B_reg(vecD dst, vecD src1, vecD src2) %{
14145   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14146   match(Set dst (MinV src1 src2));
14147   effect(TEMP dst);
14148   format %{ "movdqu  $dst,$src1\n\t"
14149             "pminsb  $dst,$src2\t!  " %}
14150   ins_encode %{
14151     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14152     __ pminsb($dst$$XMMRegister, $src2$$XMMRegister);
14153   %}
14154   ins_pipe( pipe_slow );
14155 %}
14156 
14157 instruct min8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
14158   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14159   match(Set dst (MinV src1 src2));
14160   format %{ "vpminsb  $dst,$src1,$src2\t!  " %}
14161   ins_encode %{
14162     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14163   %}
14164   ins_pipe( pipe_slow );
14165 %}
14166 
14167 instruct min16B_reg(vecX dst, vecX src1, vecX src2) %{
14168   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14169   match(Set dst (MinV src1 src2));
14170   effect(TEMP dst);
14171   format %{ "movdqu  $dst,$src1\n\t"
14172             "pminsb  $dst,$src2\t!  " %}
14173   ins_encode %{
14174     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14175     __ pminsb($dst$$XMMRegister, $src2$$XMMRegister);
14176   %}
14177   ins_pipe( pipe_slow );
14178 %}
14179 
14180 instruct min16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
14181   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14182   match(Set dst (MinV src1 src2));
14183   format %{ "vpminsb    $dst,$src1,$src2\t! " %}
14184   ins_encode %{
14185     int vector_len = 0;
14186     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14187   %}
14188   ins_pipe( pipe_slow );
14189 %}
14190 
14191 instruct min32B_reg(vecY dst, vecY src1, vecY src2) %{
14192   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14193   match(Set dst (MinV src1 src2));
14194   format %{ "vpminsb    $dst,$src1,$src2\t! " %}
14195   ins_encode %{
14196     int vector_len = 1;
14197     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14198   %}
14199   ins_pipe( pipe_slow );
14200 %}
14201 
14202 instruct min64B_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14203   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14204   match(Set dst (MinV src1 src2));
14205   format %{ "vpminsb  $dst,$src1,$src2\t! " %}
14206   ins_encode %{
14207     int vector_len = 2;
14208     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14209   %}
14210   ins_pipe( pipe_slow );
14211 %}
14212 
// Short vector Min
14214 instruct min4S_reg(vecD dst, vecD src1, vecD src2) %{
14215   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14216   match(Set dst (MinV src1 src2));
14217   effect(TEMP dst);
14218   format %{ "movsd   $dst,$src1\n\t"
14219             "pminsw  $dst,$src2\t! " %}
14220   ins_encode %{
14221     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14222     __ pminsw($dst$$XMMRegister, $src2$$XMMRegister);
14223   %}
14224   ins_pipe( pipe_slow );
14225 %}
14226 
14227 instruct min4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
14228   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14229   match(Set dst (MinV src1 src2));
14230   effect(TEMP dst);
14231   format %{ "vpminsw  $dst,$src1,$src2\t! " %}
14232   ins_encode %{
14233     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14234   %}
14235   ins_pipe( pipe_slow );
14236 %}
14237 
14238 instruct min8S_reg(vecX dst, vecX src1, vecX src2) %{
14239   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14240   match(Set dst (MinV src1 src2));
14241   effect(TEMP dst);
14242   format %{ "movdqu   $dst,$src1\n\t"
14243             "pminsw  $dst,$src2\t! " %}
14244   ins_encode %{
14245     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14246     __ pminsw($dst$$XMMRegister, $src2$$XMMRegister);
14247   %}
14248   ins_pipe( pipe_slow );
14249 %}
14250 
14251 instruct min8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
14252   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14253   match(Set dst (MinV src1 src2));
14254   format %{ "vpminsw    $dst,$src1,$src2\t! " %}
14255   ins_encode %{
14256     int vector_len = 0;
14257     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14258   %}
14259   ins_pipe( pipe_slow );
14260 %}
14261 
14262 instruct min16S_reg(vecY dst, vecY src1, vecY src2) %{
14263   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14264   match(Set dst (MinV src1 src2));
14265   format %{ "vpminsw    $dst,$src1,$src2\t! " %}
14266   ins_encode %{
14267     int vector_len = 1;
14268     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14269   %}
14270   ins_pipe( pipe_slow );
14271 %}
14272 
14273 instruct min32S_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14274   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14275   match(Set dst (MinV src1 src2));
14276   format %{ "vpminsw  $dst,$src1,$src2\t! " %}
14277   ins_encode %{
14278     int vector_len = 2;
14279     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14280   %}
14281   ins_pipe( pipe_slow );
14282 %}
14283 
14284 // Int vector Min
14285 instruct min2I_reg(vecD dst, vecD src1, vecD src2) %{
14286   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14287   match(Set dst (MinV src1 src2));
14288   effect(TEMP dst);
14289   format %{ "movsd   $dst,$src1\n\t"
14290             "pminsd  $dst,$src2\t! " %}
14291   ins_encode %{
14292     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14293     __ pminsd($dst$$XMMRegister, $src2$$XMMRegister);
14294   %}
14295   ins_pipe( pipe_slow );
14296 %}
14297 
14298 instruct min2I_reg_avx(vecD dst, vecD src1, vecD src2) %{
14299   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14300   match(Set dst (MinV src1 src2));
14301   format %{ "vpminsd    $dst,$src1,$src2\t! " %}
14302   ins_encode %{
14303     int vector_len = 0;
14304     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14305   %}
14306   ins_pipe( pipe_slow );
14307 %}
14308 
14309 instruct min4I_reg(vecX dst, vecX src1, vecX src2) %{
14310   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14311   match(Set dst (MinV src1 src2));
14312   effect(TEMP dst);
14313   format %{ "movdqu   $dst,$src1\n\t"
14314             "pminsd   $dst,$src2\t! " %}
14315   ins_encode %{
14316     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14317     __ pminsd($dst$$XMMRegister, $src2$$XMMRegister);
14318   %}
14319   ins_pipe( pipe_slow );
14320 %}
14321 
14322 instruct min4I_reg_avx(vecX dst, vecX src1, vecX src2) %{
14323   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14324   match(Set dst (MinV src1 src2));
14325   format %{ "vpminsd    $dst,$src1,$src2\t! " %}
14326   ins_encode %{
14327     int vector_len = 0;
14328     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14329   %}
14330   ins_pipe( pipe_slow );
14331 %}
14332 
14333 instruct min4I_reg_evex(vecX dst, vecX src1, vecX src2) %{
14334   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14335   match(Set dst (MinV src1 src2));
14336   format %{ "vpminsd  $dst,$src1,$src2\t! " %}
14337   ins_encode %{
14338     int vector_len = 0;
14339     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14340   %}
14341   ins_pipe( pipe_slow );
14342 %}
14343 
14344 instruct min8I_reg_avx(vecY dst, vecY src1, vecY src2) %{
14345   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14346   match(Set dst (MinV src1 src2));
14347   format %{ "vpminsd    $dst,$src1,$src2\t! " %}
14348   ins_encode %{
14349     int vector_len = 1;
14350     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14351   %}
14352   ins_pipe( pipe_slow );
14353 %}
14354 
14355 instruct min8I_reg_evex(vecY dst, vecY src1, vecY src2) %{
14356   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14357   match(Set dst (MinV src1 src2));
14358   format %{ "vpminsd  $dst,$src1,$src2\t! " %}
14359   ins_encode %{
14360     int vector_len = 1;
14361     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14362   %}
14363   ins_pipe( pipe_slow );
14364 %}
14365 
14366 instruct min16I_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14367   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14368   match(Set dst (MinV src1 src2));
14369   format %{ "vpminsd  $dst,$src1,$src2\t! " %}
14370   ins_encode %{
14371     int vector_len = 2;
14372     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14373   %}
14374   ins_pipe( pipe_slow );
14375 %}
14376 
14377 // Long vector Min
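// There is no packed long min below AVX-512VL (vpminsq), so the SSE4.1 and AVX
// rules synthesize it: pcmpgtq builds a mask of lanes where src1 > src2 and
// the blend then selects src2 in exactly those lanes.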
14378 instruct minL_reg(vecD dst, vecD src1, vecD src2, rxmm0 tmp) %{
14379   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14380   match(Set dst (MinV src1 src2));
14381   effect(TEMP dst, TEMP tmp);
14382   format %{ "movsd     $tmp,$src1\n\t"
14383             "movsd     $dst,$src1\n\t"
14384             "pcmpgtq   $tmp,$src2\n\t"
14385             "blendvpd  $dst,$src2\t! " %}
14386   ins_encode %{
14387     __ movsd($tmp$$XMMRegister, $src1$$XMMRegister);
14388     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14389     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
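    // SSE4.1 blendvpd reads its mask implicitly from xmm0, which is why the
    // temp is constrained to the rxmm0 operand class.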
14390     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14391   %}
14392   ins_pipe( pipe_slow );
14393 %}
14394 
14395 instruct min1L_reg_avx(vecD dst, vecD src1, vecD src2) %{
14396   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14397   match(Set dst (MinV src1 src2));
14398   effect(TEMP dst);
14399   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14400             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14401   ins_encode %{
14402     int vector_len = 0;
14403     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14404     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14405   %}
14406   ins_pipe( pipe_slow );
14407 %}
14408 
14409 instruct min2L_reg(vecX dst, vecX src1, vecX src2, rxmm0 tmp) %{
14410   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14411   match(Set dst (MinV src1 src2));
14412   effect(TEMP dst, TEMP tmp);
14413   format %{ "movdqu    $tmp,$src1\n\t"
14414             "movdqu    $dst,$src1\n\t"
14415             "pcmpgtq   $tmp,$src2\n\t"
14416             "blendvpd  $dst,$src2\t! " %}
14417   ins_encode %{
14418     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
14419     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14420     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14421     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14422   %}
14423   ins_pipe( pipe_slow );
14424 %}
14425 
14426 instruct min2L_reg_avx(vecX dst, vecX src1, vecX src2) %{
14427   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14428   match(Set dst (MinV src1 src2));
14429   effect(TEMP dst);
14430   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14431             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14432   ins_encode %{
14433     int vector_len = 0;
14434     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14435     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14436   %}
14437   ins_pipe( pipe_slow );
14438 %}
14439 
14440 instruct min4L_reg_avx(vecY dst, vecY src1, vecY src2) %{
14441   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14442   match(Set dst (MinV src1 src2));
14443   effect(TEMP dst);
14444   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14445             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14446   ins_encode %{
14447     int vector_len = 1;
14448     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14449     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14450   %}
14451   ins_pipe( pipe_slow );
14452 %}
14453 
14454 instruct min2L_reg_evex(vecX dst, vecX src1, vecX src2) %{
14455   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14456   match(Set dst (MinV src1 src2));
  format %{ "vpminsq  $dst,$src1,$src2\t! " %}
14458   ins_encode %{
14459     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14460   %}
14461   ins_pipe( pipe_slow );
14462 %}
14463 
14464 instruct min4L_reg_evex(vecY dst, vecY src1, vecY src2) %{
14465   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14466   match(Set dst (MinV src1 src2));
  format %{ "vpminsq  $dst,$src1,$src2\t! " %}
14468   ins_encode %{
14469     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 1);
14470   %}
14471   ins_pipe( pipe_slow );
14472 %}
14473 
14474 instruct min8L_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14475   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14476   match(Set dst (MinV src1 src2));
  format %{ "vpminsq  $dst,$src1,$src2\t! " %}
14478   ins_encode %{
14479     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 2);
14480   %}
14481   ins_pipe( pipe_slow );
14482 %}
14483 
14484 // Float vector Min
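// Plain vminps does not match Java's Math.min: it is not symmetric for
// (-0.0, +0.0) and returns its second operand when an input is NaN. The
// rules below therefore order the operands by the sign of $a and then
// repair NaN lanes. Per lane, the intent is roughly (a sketch, not the
// emitted code):
//
//   atmp = signbit(a) ? b : a;
//   btmp = signbit(a) ? a : b;       // a negative a ends up as the second
//                                    // operand, which minps favors on ties
//   tmp  = minps(atmp, btmp);        // a NaN in either lane yields btmp
//   dst  = isnan(atmp) ? atmp : tmp; // cmpps.unordered + blendvps fix-up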
14485 instruct min2F_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
14486   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14487   match(Set dst (MinV a b));
14488   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14489   format %{
14490      "blendvps         $atmp,$a,$b,$a             \n\t"
14491      "blendvps         $btmp,$b,$a,$a             \n\t"
14492      "vminps           $tmp,$atmp,$btmp           \n\t"
14493      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
14494      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
14495   %}
14496   ins_encode %{
14497     int vector_len = 0;
14498     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14499     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14500     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14501     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14502     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14503   %}
14504   ins_pipe( pipe_slow );
14505 %}
14506 
14507 instruct min4F_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
14508   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14509   match(Set dst (MinV a b));
14510   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14511   format %{
14512      "blendvps         $atmp,$a,$b,$a             \n\t"
14513      "blendvps         $btmp,$b,$a,$a             \n\t"
14514      "vminps           $tmp,$atmp,$btmp           \n\t"
14515      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
14516      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
14517   %}
14518   ins_encode %{
14519     int vector_len = 0;
14520     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14521     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14522     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14523     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14524     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14525   %}
14526   ins_pipe( pipe_slow );
14527 %}
14528 
14529 instruct min8F_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
14530   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14531   match(Set dst (MinV a b));
14532   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14533   format %{
14534      "blendvps         $atmp,$a,$b,$a             \n\t"
14535      "blendvps         $btmp,$b,$a,$a             \n\t"
14536      "vminps           $tmp,$atmp,$btmp           \n\t"
14537      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
14538      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
14539   %}
14540   ins_encode %{
14541     int vector_len = 1;
14542     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14543     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14544     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14545     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14546     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14547   %}
14548   ins_pipe( pipe_slow );
14549 %}
14550 
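// On AVX-512DQ the same ordering/fix-up idea uses opmask registers instead
// of xmm blends: vpmovd2m copies the per-lane sign bits of $a into a mask,
// vblendmps performs both ordering blends under that mask, and the NaN
// repair becomes a masked move under the unordered-compare mask. k1 is
// hard-coded here as the temporary mask register.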
14551 instruct min16F_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14553   match(Set dst (MinV a b));
14554   effect(USE a, USE b, TEMP atmp, TEMP btmp);
  format %{
     "vpmovd2m         k1,$a                    \n\t"
     "vblendmps        $atmp,k1,$a,$b           \n\t"
     "vblendmps        $btmp,k1,$b,$a           \n\t"
     "vminps           $dst,$atmp,$btmp         \n\t"
     "vcmpps.unordered k1,$atmp,$atmp           \n\t"
     "vmovaps          $dst,k1,$atmp            \n\t"
  %}
14563   ins_encode %{
14564     int vector_len = 2;
14565     KRegister ktmp = k1;
14566     KRegister mask = k0;
14567     __ evpmovd2m(ktmp, $a$$XMMRegister, vector_len); 
14568     __ evblendmps($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
14569     __ evblendmps($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
14570     __ vminps($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14571     __ evcmpps(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14572     __ evmovdqul($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
14573   %}
14574   ins_pipe( pipe_slow );
14575 %}
14576 
14577 // Double vector Min
14578 instruct min1D_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
14579   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14580   match(Set dst (MinV a b));
14581   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14582   format %{ 
14583      "blendvpd         $atmp,$a,$b,$a           \n\t"
14584      "blendvpd         $btmp,$b,$a,$a           \n\t"
14585      "vminpd           $tmp,$atmp,$btmp         \n\t"
14586      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
14587      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
14588   %}
14589   ins_encode %{
14590     int vector_len = 0;
14591     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14592     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14593     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14594     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14595     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14596   %}
14597   ins_pipe( pipe_slow );
14598 %}
14599 
14600 instruct min2D_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
14601   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14602   match(Set dst (MinV a b));
14603   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14604   format %{ 
14605      "blendvpd         $atmp,$a,$b,$a           \n\t"
14606      "blendvpd         $btmp,$b,$a,$a           \n\t"
14607      "vminpd           $tmp,$atmp,$btmp         \n\t"
14608      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
14609      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
14610   %}
14611   ins_encode %{
14612     int vector_len = 0;
14613     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14614     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14615     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14616     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14617     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14618   %}
14619   ins_pipe( pipe_slow );
14620 %}
14621 
14622 instruct min4D_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
14623   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14624   match(Set dst (MinV a b));
14625   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14626   format %{ 
14627      "blendvpd         $atmp,$a,$b,$a           \n\t"
14628      "blendvpd         $btmp,$b,$a,$a           \n\t"
14629      "vminpd           $tmp,$atmp,$btmp         \n\t"
14630      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
14631      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
14632   %}
14633   ins_encode %{
14634     int vector_len = 1;
14635     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14636     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14637     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14638     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14639     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14640   %}
14641   ins_pipe( pipe_slow );
14642 %}
14643 
14644 instruct min8D_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
14645   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14646   match(Set dst (MinV a b));
14647   effect(USE a, USE b, TEMP atmp, TEMP btmp);
  format %{
     "vpmovq2m         k1,$a                    \n\t"
     "vblendmpd        $atmp,k1,$a,$b           \n\t"
     "vblendmpd        $btmp,k1,$b,$a           \n\t"
     "vminpd           $dst,$atmp,$btmp         \n\t"
     "vcmppd.unordered k1,$atmp,$atmp           \n\t"
     "vmovapd          $dst,k1,$atmp            \n\t"
  %}
14656   ins_encode %{
14657     int vector_len = 2;
14658     KRegister ktmp = k1;
14659     KRegister mask = k0;
14660     __ evpmovq2m(ktmp, $a$$XMMRegister, vector_len); 
14661     __ evblendmpd($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
14662     __ evblendmpd($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
14663     __ vminpd($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14664     __ evcmppd(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14665     __ evmovdquq($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
14666   %}
14667   ins_pipe( pipe_slow );
14668 %}
14669 
14670 // ------------------------------ Max ---------------------------------------
14671 // Byte vector Max
14672 instruct max8B_reg(vecD dst, vecD src1, vecD src2) %{
14673   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14674   match(Set dst (MaxV src1 src2));
14675   effect(TEMP dst);
14676   format %{ "movsd   $dst,$src1\n\t"
14677             "pmaxsb  $dst,$src2\t! " %}
14678   ins_encode %{
14679     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14680     __ pmaxsb($dst$$XMMRegister, $src2$$XMMRegister);
14681   %}
14682   ins_pipe( pipe_slow );
14683 %}
14684 
14685 instruct max8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
14686   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14687   match(Set dst (MaxV src1 src2));
14688   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
14689   ins_encode %{
14690     int vector_len = 0;
14691     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14692   %}
14693   ins_pipe( pipe_slow );
14694 %}
14695 
14696 instruct max16B_reg(vecX dst, vecX src1, vecX src2) %{
14697   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14698   match(Set dst (MaxV src1 src2));
14699   effect(TEMP dst);
14700   format %{ "movdqu  $dst,$src1\n\t"
14701             "pmaxsb  $dst,$src2\t! " %}
14702   ins_encode %{
14703     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14704     __ pmaxsb($dst$$XMMRegister, $src2$$XMMRegister);
14705   %}
14706   ins_pipe( pipe_slow );
14707 %}
14708 
14709 instruct max16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
14710   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14711   match(Set dst (MaxV src1 src2));
14712   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
14713   ins_encode %{
14714     int vector_len = 0;
14715     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14716   %}
14717   ins_pipe( pipe_slow );
14718 %}
14719 
14720 instruct max32B_reg(vecY dst, vecY src1, vecY src2) %{
14721   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14722   match(Set dst (MaxV src1 src2));
14723   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
14724   ins_encode %{
14725     int vector_len = 1;
14726     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14727   %}
14728   ins_pipe( pipe_slow );
14729 %}
14730 
14731 instruct max64B_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14732   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14733   match(Set dst (MaxV src1 src2));
14734   format %{ "vpmaxsb  $dst,$src1,$src2\t! " %}
14735   ins_encode %{
14736     int vector_len = 2;
14737     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14738   %}
14739   ins_pipe( pipe_slow );
14740 %}
14741 
// Short vector Max
14743 instruct max4S_reg(vecD dst, vecD src1, vecD src2) %{
14744   predicate(UseSSE > 1 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14745   match(Set dst (MaxV src1 src2));
14746   effect(TEMP dst);
14747   format %{ "movsd   $dst,$src1\n\t"
14748             "pmaxsw  $dst,$src2\t! " %}
14749   ins_encode %{
14750     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14751     __ pmaxsw($dst$$XMMRegister, $src2$$XMMRegister);
14752   %}
14753   ins_pipe( pipe_slow );
14754 %}
14755 
14756 instruct max4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
14757   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14758   match(Set dst (MaxV src1 src2));
14759   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
14760   ins_encode %{
14761     int vector_len = 0;
14762     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14763   %}
14764   ins_pipe( pipe_slow );
14765 %}
14766 
14767 instruct max8S_reg(vecX dst, vecX src1, vecX src2) %{
14768   predicate(UseSSE > 1 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14769   match(Set dst (MaxV src1 src2));
14770   effect(TEMP dst);
14771   format %{ "movdqu  $dst,$src1\n\t"
14772             "pmaxsw  $dst,$src2\t! " %}
14773   ins_encode %{
14774     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14775     __ pmaxsw($dst$$XMMRegister, $src2$$XMMRegister);
14776   %}
14777   ins_pipe( pipe_slow );
14778 %}
14779 
14780 instruct max8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
14781   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14782   match(Set dst (MaxV src1 src2));
14783   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
14784   ins_encode %{
14785     int vector_len = 0;
14786     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14787   %}
14788   ins_pipe( pipe_slow );
14789 %}
14790 
14791 instruct max16S_reg(vecY dst, vecY src1, vecY src2) %{
14792   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14793   match(Set dst (MaxV src1 src2));
14794   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
14795   ins_encode %{
14796     int vector_len = 1;
14797     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14798   %}
14799   ins_pipe( pipe_slow );
14800 %}
14801 
14802 instruct max32S_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14803   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14804   match(Set dst (MaxV src1 src2));
14805   format %{ "vpmaxsw  $dst,$src1,$src2\t! " %}
14806   ins_encode %{
14807     int vector_len = 2;
14808     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14809   %}
14810   ins_pipe( pipe_slow );
14811 %}
14812 
14813 // Int vector Max
14814 instruct max2I_reg(vecD dst, vecD src1, vecD src2) %{
14815   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14816   match(Set dst (MaxV src1 src2));
14817   effect(TEMP dst);
14818   format %{ "movdqu  $dst,$src1\n\t"
14819             "pmaxsd  $dst,$src2\t! " %}
14820   ins_encode %{
14821     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14822     __ pmaxsd($dst$$XMMRegister, $src2$$XMMRegister);
14823   %}
14824   ins_pipe( pipe_slow );
14825 %}
14826 
14827 instruct max2I_reg_avx(vecD dst, vecD src1, vecD src2) %{
14828   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14829   match(Set dst (MaxV src1 src2));
14830   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
14831   ins_encode %{
14832     int vector_len = 0;
14833     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14834   %}
14835   ins_pipe( pipe_slow );
14836 %}
14837 
14838 instruct max4I_reg(vecX dst, vecX src1, vecX src2) %{
14839   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14840   match(Set dst (MaxV src1 src2));
14841   effect(TEMP dst);
14842   format %{ "movdqu  $dst,$src1\n\t"
14843             "pmaxsd  $dst,$src2\t! " %}
14844   ins_encode %{
14845     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14846     __ pmaxsd($dst$$XMMRegister, $src2$$XMMRegister);
14847   %}
14848   ins_pipe( pipe_slow );
14849 %}
14850 
14851 instruct max4I_reg_avx(vecX dst, vecX src1, vecX src2) %{
14852   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14853   match(Set dst (MaxV src1 src2));
14854   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
14855   ins_encode %{
14856     int vector_len = 0;
14857     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14858   %}
14859   ins_pipe( pipe_slow );
14860 %}
14861 
14862 instruct max4I_reg_evex(vecX dst, vecX src1, vecX src2) %{
14863   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14864   match(Set dst (MaxV src1 src2));
14865   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
14866   ins_encode %{
14867     int vector_len = 0;
14868     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14869   %}
14870   ins_pipe( pipe_slow );
14871 %}
14872 
14873 instruct max8I_reg_avx(vecY dst, vecY src1, vecY src2) %{
14874   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14875   match(Set dst (MaxV src1 src2));
14876   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
14877   ins_encode %{
14878     int vector_len = 1;
14879     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14880   %}
14881   ins_pipe( pipe_slow );
14882 %}
14883 
14884 instruct max8I_reg_evex(vecY dst, vecY src1, vecY src2) %{
14885   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14886   match(Set dst (MaxV src1 src2));
14887   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
14888   ins_encode %{
14889     int vector_len = 1;
14890     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14891   %}
14892   ins_pipe( pipe_slow );
14893 %}
14894 
14895 instruct max16I_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14896   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14897   match(Set dst (MaxV src1 src2));
14898   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
14899   ins_encode %{
14900     int vector_len = 2;
14901     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14902   %}
14903   ins_pipe( pipe_slow );
14904 %}
14905 
// Long vector Max
14907 instruct maxL_reg(vecD dst, vecD src1, vecD src2, rxmm0 tmp) %{
14908   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14909   match(Set dst (MaxV src1 src2));
14910   effect(TEMP dst, TEMP tmp);
  format %{ "movsd     $tmp,$src1\n\t"
            "movsd     $dst,$src2\n\t"
            "pcmpgtq   $tmp,$src2\n\t"
            "blendvpd  $dst,$src1\t! " %}
14915   ins_encode %{
14916     __ movsd($tmp$$XMMRegister, $src1$$XMMRegister);
14917     __ movsd($dst$$XMMRegister, $src2$$XMMRegister);
14918     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14919     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister);
14920   %}
14921   ins_pipe( pipe_slow );
14922 %}
14923 
14924 instruct max1L_reg_avx(vecD dst, vecD src1, vecD src2) %{
14925   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14926   match(Set dst (MaxV src1 src2));
14927   effect(TEMP dst);
14928   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14929             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
14930   ins_encode %{
14931     int vector_len = 0;
14932     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14933     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
14934   %}
14935   ins_pipe( pipe_slow );
14936 %}
14937 
14938 instruct max2L_reg(vecX dst, vecX src1, vecX src2, rxmm0 tmp) %{
14939   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14940   match(Set dst (MaxV src1 src2));
14941   effect(TEMP dst, TEMP tmp);
14942   format %{ "movdqu    $tmp,$src2\n\t"
14943             "movdqu    $dst,$src1\n\t"
14944             "pcmpgtq   $tmp,$src1\n\t"
14945             "blendvpd  $dst,$src2\t! " %}
14946   ins_encode %{
14947     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
14948     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14949     __ pcmpgtq($tmp$$XMMRegister, $src1$$XMMRegister);
14950     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14951   %}
14952   ins_pipe( pipe_slow );
14953 %}
14954 
14955 instruct max2L_reg_avx(vecX dst, vecX src1, vecX src2) %{
14956   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14957   match(Set dst (MaxV src1 src2));
14958   effect(TEMP dst);
14959   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14960             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
14961   ins_encode %{
14962     int vector_len = 0;
14963     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14964     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
14965   %}
14966   ins_pipe( pipe_slow );
14967 %}
14968 
14969 instruct max2L_reg_evex(vecX dst, vecX src1, vecX src2) %{
14970   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14971   match(Set dst (MaxV src1 src2));
  format %{ "vpmaxsq  $dst,$src1,$src2\t! " %}
14973   ins_encode %{
14974     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14975   %}
14976   ins_pipe( pipe_slow );
14977 %}
14978 
14979 instruct max4L_reg_avx(vecY dst, vecY src1, vecY src2) %{
14980   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14981   match(Set dst (MaxV src1 src2));
14982   effect(TEMP dst);
14983   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14984             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
14985   ins_encode %{
14986     int vector_len = 1;
14987     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14988     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
14989   %}
14990   ins_pipe( pipe_slow );
14991 %}
14992 
14993 instruct max4L_reg_evex(vecY dst, vecY src1, vecY src2) %{
14994   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14995   match(Set dst (MaxV src1 src2));
  format %{ "vpmaxsq  $dst,$src1,$src2\t! " %}
14997   ins_encode %{
14998     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 1);
14999   %}
15000   ins_pipe( pipe_slow );
15001 %}
15002 
15003 instruct max8L_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15004   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15005   match(Set dst (MaxV src1 src2));
  format %{ "vpmaxsq  $dst,$src1,$src2\t! " %}
15007   ins_encode %{
15008     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 2);
15009   %}
15010   ins_pipe( pipe_slow );
15011 %}
15012 
// Float vector Max
15014 instruct max2F_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
15015   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15016   match(Set dst (MaxV a b));
15017   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
15018   format %{ 
15019      "blendvps         $btmp,$b,$a,$b           \n\t"
15020      "blendvps         $atmp,$a,$b,$b           \n\t"
15021      "vmaxps           $tmp,$atmp,$btmp         \n\t"
15022      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
15023      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
15024   %}
15025   ins_encode %{
15026     int vector_len = 0;
15027     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15028     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15029     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15030     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15031     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15032  %}
15033  ins_pipe( pipe_slow );
15034 %}
15035 
15036 instruct max4F_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
15037   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15038   match(Set dst (MaxV a b));
15039   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
15040   format %{ 
15041      "blendvps         $btmp,$b,$a,$b           \n\t"
15042      "blendvps         $atmp,$a,$b,$b           \n\t"
15043      "vmaxps           $tmp,$atmp,$btmp         \n\t"
15044      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
15045      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
15046   %}
15047   ins_encode %{
15048     int vector_len = 0;
15049     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15050     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15051     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15052     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15053     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15054  %}
15055  ins_pipe( pipe_slow );
15056 %}
15057 
15058 instruct max8F_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
15059   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15060   match(Set dst (MaxV a b));
15061   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
15062   format %{ 
15063      "blendvps         $btmp,$b,$a,$b           \n\t"
15064      "blendvps         $atmp,$a,$b,$b           \n\t"
15065      "vmaxps           $tmp,$atmp,$btmp         \n\t"
15066      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
15067      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
15068   %}
15069   ins_encode %{
15070     int vector_len = 1;
15071     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15072     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15073     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15074     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15075     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15076  %}
15077  ins_pipe( pipe_slow );
15078 %}
15079 
15080 instruct max16F_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
15081   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15082   match(Set dst (MaxV a b));
15083   effect(USE a, USE b, TEMP atmp, TEMP btmp);
  format %{
     "vpmovd2m         k1,$b              \n\t"
     "vblendmps        $atmp,k1,$a,$b     \n\t"
     "vblendmps        $btmp,k1,$b,$a     \n\t"
     "vmaxps           $dst,$atmp,$btmp   \n\t"
     "vcmpps.unordered k1,$atmp,$atmp     \n\t"
     "vmovaps          $dst,k1,$atmp      \n\t"
  %}
15092   ins_encode %{
15093     int vector_len = 2;
15094     KRegister ktmp = k1; 
15095     KRegister mask = k0;
15096     __ evpmovd2m(ktmp, $b$$XMMRegister, vector_len); 
15097     __ evblendmps($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
15098     __ evblendmps($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
15099     __ vmaxps($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15100     __ evcmpps(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15101     __ evmovdqul($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
15102   %}
15103   ins_pipe( pipe_slow );
15104 %}
15105 
// Double vector Max
15107 instruct max1D_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
15108   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15109   match(Set dst (MaxV a b));
15110   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
15111   format %{ 
15112      "blendvpd         $btmp,$b,$a,$b            \n\t"
15113      "blendvpd         $atmp,$a,$b,$b            \n\t"
15114      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
15115      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
15116      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
15117   %}
15118   ins_encode %{
15119     int vector_len = 0;
15120     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15121     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15122     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15123     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15124     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15125   %}
15126   ins_pipe( pipe_slow );
15127 %}
15128 
15129 instruct max2D_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
15130   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15131   match(Set dst (MaxV a b));
15132   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
15133   format %{ 
15134      "blendvpd         $btmp,$b,$a,$b            \n\t"
15135      "blendvpd         $atmp,$a,$b,$b            \n\t"
15136      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
15137      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
15138      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
15139   %}
15140   ins_encode %{
15141     int vector_len = 0;
15142     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15143     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15144     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15145     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15146     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15147   %}
15148   ins_pipe( pipe_slow );
15149 %}
15150 
15151 instruct max4D_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
15152   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15153   match(Set dst (MaxV a b));
15154   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
15155   format %{ 
15156      "blendvpd         $btmp,$b,$a,$b            \n\t"
15157      "blendvpd         $atmp,$a,$b,$b            \n\t"
15158      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
15159      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
15160      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
15161   %}
15162   ins_encode %{
15163     int vector_len = 1;
15164     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15165     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15166     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15167     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15168     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15169   %}
15170   ins_pipe( pipe_slow );
15171 %}
15172 
15174 instruct max8D_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
15175   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15176   match(Set dst (MaxV a b));
15177   effect(USE a, USE b, TEMP atmp, TEMP btmp);
  format %{
     "vpmovq2m         k1,$b              \n\t"
     "vblendmpd        $atmp,k1,$a,$b     \n\t"
     "vblendmpd        $btmp,k1,$b,$a     \n\t"
     "vmaxpd           $dst,$atmp,$btmp   \n\t"
     "vcmppd.unordered k1,$atmp,$atmp     \n\t"
     "vmovapd          $dst,k1,$atmp      \n\t"
  %}
15186   ins_encode %{
15187     int vector_len = 2;
15188     KRegister ktmp = k1; 
15189     KRegister mask = k0;
15190     __ evpmovq2m(ktmp, $b$$XMMRegister, vector_len); 
15191     __ evblendmpd($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
15192     __ evblendmpd($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
15193     __ vmaxpd($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15194     __ evcmppd(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15195     __ evmovdquq($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
15196   %}
15197   ins_pipe( pipe_slow );
15198 %}
15199 
15200 // ------------------------------ Shift ---------------------------------------
15201 
15202 // Left and right shift count vectors are the same on x86
15203 // (only lowest bits of xmm reg are used for count).
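// A single vshiftcnt result therefore feeds every variable-shift rule
// below (vsll*/vsrl*/vsra*): the hardware shifts read the count from the
// low 64 bits of the xmm register and ignore the rest.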
15204 instruct vshiftcnt(vecS dst, rRegI cnt) %{
15205   match(Set dst (LShiftCntV cnt));
15206   match(Set dst (RShiftCntV cnt));
15207   format %{ "movd    $dst,$cnt\t! load shift count" %}
15208   ins_encode %{
15209     __ movdl($dst$$XMMRegister, $cnt$$Register);
15210   %}
15211   ins_pipe( pipe_slow );
15212 %}
15213 
15214 // --------------------------------- Sqrt --------------------------------------
15215 
15216 // Floating point vector sqrt
15217 instruct vsqrt2D_reg(vecX dst, vecX src) %{
15218   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15219   match(Set dst (SqrtVD src));
15220   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
15221   ins_encode %{
15222     int vector_len = 0;
15223     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15224   %}
15225   ins_pipe( pipe_slow );
15226 %}
15227 
15228 instruct vsqrt2D_mem(vecX dst, memory mem) %{
15229   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15230   match(Set dst (SqrtVD (LoadVector mem)));
15231   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
15232   ins_encode %{
15233     int vector_len = 0;
15234     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15235   %}
15236   ins_pipe( pipe_slow );
15237 %}
15238 
15239 instruct vsqrt4D_reg(vecY dst, vecY src) %{
15240   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15241   match(Set dst (SqrtVD src));
15242   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
15243   ins_encode %{
15244     int vector_len = 1;
15245     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15246   %}
15247   ins_pipe( pipe_slow );
15248 %}
15249 
15250 instruct vsqrt4D_mem(vecY dst, memory mem) %{
15251   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15252   match(Set dst (SqrtVD (LoadVector mem)));
15253   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
15254   ins_encode %{
15255     int vector_len = 1;
15256     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15257   %}
15258   ins_pipe( pipe_slow );
15259 %}
15260 
15261 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
15262   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15263   match(Set dst (SqrtVD src));
15264   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
15265   ins_encode %{
15266     int vector_len = 2;
15267     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15268   %}
15269   ins_pipe( pipe_slow );
15270 %}
15271 
15272 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
15273   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15274   match(Set dst (SqrtVD (LoadVector mem)));
15275   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
15276   ins_encode %{
15277     int vector_len = 2;
15278     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15279   %}
15280   ins_pipe( pipe_slow );
15281 %}
15282 
15283 instruct vsqrt2F_reg(vecD dst, vecD src) %{
15284   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15285   match(Set dst (SqrtVF src));
15286   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
15287   ins_encode %{
15288     int vector_len = 0;
15289     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15290   %}
15291   ins_pipe( pipe_slow );
15292 %}
15293 
15294 instruct vsqrt2F_mem(vecD dst, memory mem) %{
15295   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15296   match(Set dst (SqrtVF (LoadVector mem)));
15297   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
15298   ins_encode %{
15299     int vector_len = 0;
15300     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15301   %}
15302   ins_pipe( pipe_slow );
15303 %}
15304 
15305 instruct vsqrt4F_reg(vecX dst, vecX src) %{
15306   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15307   match(Set dst (SqrtVF src));
15308   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
15309   ins_encode %{
15310     int vector_len = 0;
15311     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15312   %}
15313   ins_pipe( pipe_slow );
15314 %}
15315 
15316 instruct vsqrt4F_mem(vecX dst, memory mem) %{
15317   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15318   match(Set dst (SqrtVF (LoadVector mem)));
15319   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
15320   ins_encode %{
15321     int vector_len = 0;
15322     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15323   %}
15324   ins_pipe( pipe_slow );
15325 %}
15326 
15327 instruct vsqrt8F_reg(vecY dst, vecY src) %{
15328   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15329   match(Set dst (SqrtVF src));
15330   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
15331   ins_encode %{
15332     int vector_len = 1;
15333     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15334   %}
15335   ins_pipe( pipe_slow );
15336 %}
15337 
15338 instruct vsqrt8F_mem(vecY dst, memory mem) %{
15339   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15340   match(Set dst (SqrtVF (LoadVector mem)));
15341   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
15342   ins_encode %{
15343     int vector_len = 1;
15344     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15345   %}
15346   ins_pipe( pipe_slow );
15347 %}
15348 
15349 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
15350   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15351   match(Set dst (SqrtVF src));
15352   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
15353   ins_encode %{
15354     int vector_len = 2;
15355     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15356   %}
15357   ins_pipe( pipe_slow );
15358 %}
15359 
15360 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
15361   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15362   match(Set dst (SqrtVF (LoadVector mem)));
15363   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
15364   ins_encode %{
15365     int vector_len = 2;
15366     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15367   %}
15368   ins_pipe( pipe_slow );
15369 %}
15370 
15371 // ------------------------------ LeftShift -----------------------------------
15372 
15373 // Byte vector left shift
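// x86 has no packed byte shift, so byte shifts are synthesized by widening
// each half to 16-bit lanes, shifting, masking off the bits that crossed a
// byte boundary, and re-packing. Per half, the intent is roughly (a sketch,
// not the emitted code):
//
//   w   = sign_extend_bytes_to_words(src); // pmovsxbw
//   w   = w << shift;                      // psllw
//   w  &= 0x00ff;                          // keep only the low byte
//   dst = pack_words_to_bytes(w);          // packuswb
//
// The 0x00ff mask (vector_short_to_byte_mask) is what makes packuswb
// truncate instead of saturate.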
15374 instruct vsll4B_reg(vecS dst, vecS src, vecS shift, vecD tmp, vecD tmp2) %{
15375   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
15376   match(Set dst (LShiftVB src shift));
15377   effect(TEMP tmp2, TEMP tmp);
  format %{"pmovsxbw  $tmp,$src\n\t"
           "psllw     $tmp,$shift\n\t"
           "movdqu    $tmp2,[0x00ff00ff,0x00ff00ff]\n\t"
           "pand      $tmp,$tmp2\n\t"
           "packuswb  $tmp,$tmp\n\t"
           "movss     $dst,$tmp\n\t! left shift packed4B" %}
15384   ins_encode %{
15385     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
15386     __ psllw($tmp$$XMMRegister, $shift$$XMMRegister);
15387     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15388     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
15389     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
15390     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
15391   %}
15392   ins_pipe( pipe_slow );
15393 %}
15394 
15395 instruct vsll8B_reg(vecD dst, vecD src, vecS shift, vecX tmp, vecX tmp2) %{
15396   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
15397   match(Set dst (LShiftVB src shift));
15398   effect(TEMP tmp2, TEMP tmp);
  format %{"pmovsxbw  $tmp,$src\n\t"
           "psllw     $tmp,$shift\n\t"
           "movdqu    $tmp2,[0x00ff00ff,0x00ff00ff]\n\t"
           "pand      $tmp,$tmp2\n\t"
           "packuswb  $tmp,$tmp\n\t"
           "movsd     $dst,$tmp\n\t! left shift packed8B" %}
15405   ins_encode %{
15406     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
15407     __ psllw($tmp$$XMMRegister, $shift$$XMMRegister);
15408     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15409     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
15410     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
15411     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
15412   %}
15413   ins_pipe( pipe_slow );
15414 %}
15415 
15416 instruct vsll16B_reg(vecX dst, vecX src, vecS shift, vecX tmp, vecX tmp2, vecX tmp3) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
15418   match(Set dst (LShiftVB src shift));
15419   effect(TEMP tmp2, TEMP tmp, TEMP tmp3);
  format %{"pmovsxbw  $tmp,$src\n\t"
           "psllw     $tmp,$shift\n\t"
           "pshufd    $tmp2,$src\n\t"
           "pmovsxbw  $tmp2,$tmp2\n\t"
           "psllw     $tmp2,$shift\n\t"
           "movdqu    $tmp3,[0x00ff00ff,0x00ff00ff]\n\t"
           "pand      $tmp,$tmp3\n\t"
           "pand      $tmp2,$tmp3\n\t"
           "packuswb  $tmp,$tmp2\n\t"
           "movdqu    $dst,$tmp\n\t! left shift packed16B" %}
15430   ins_encode %{
15431     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
15432     __ psllw($tmp$$XMMRegister, $shift$$XMMRegister);
15433     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0x0E);
15434     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
15435     __ psllw($tmp2$$XMMRegister, $shift$$XMMRegister);
15436     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15437     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
15438     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
15439     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
15440     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
15441   %}
15442   ins_pipe( pipe_slow );
15443 %}
15444 
15445 instruct vsll16B_avx(vecX dst, vecX src, vecS shift, vecY tmp, rRegL scratch) %{
15446   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
15447   match(Set dst (LShiftVB src shift));
15448   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{"vpmovsxbw  $tmp,$src\n\t"
           "vpsllw     $tmp,$tmp,$shift\n\t"
           "vpand      $tmp,$tmp,[0x00ff00ff,0x00ff00ff]\n\t"
           "vextracti128_high  $dst,$tmp\n\t"
           "vpackuswb  $dst,$tmp,$dst\n\t! left shift packed16B" %}
15454   ins_encode %{
15455     int vector_len = 1;
15456     __ vpmovsxbw($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
15457     __ vpsllw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15458     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
15459     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
15460     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
15461   %}
15462   ins_pipe( pipe_slow );
15463 %}
15464 
15465 instruct vsll32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, vecY tmp2, rRegL scratch) %{
15466   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
15467   match(Set dst (LShiftVB src shift));
15468   effect(TEMP dst, TEMP tmp2, TEMP tmp, TEMP scratch);
  format %{"vextracti128_high  $tmp,$src\n\t"
           "vpmovsxbw   $tmp,$tmp\n\t"
           "vpmovsxbw   $tmp2,$src\n\t"
           "vpsllw      $tmp,$tmp,$shift\n\t"
           "vpsllw      $tmp2,$tmp2,$shift\n\t"
           "vpand       $tmp,$tmp,[0x00ff00ff,0x00ff00ff]\n\t"
           "vpand       $tmp2,$tmp2,[0x00ff00ff,0x00ff00ff]\n\t"
           "vpackuswb   $dst,$tmp2,$tmp\n\t"
           "vpermq      $dst,$dst,0xD8\n\t! left shift for packed32B" %}
15478   ins_encode %{
15479     int vector_len = 1;
15480     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
15481     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
15482     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
15483     __ vpsllw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15484     __ vpsllw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
15485     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
15486     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
15487     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
15488     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
15489   %}
15490   ins_pipe( pipe_slow );
15491 %}
15492 
15493 instruct vsll64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp, vecZ tmp2, vecZ tmp3, rRegL scratch) %{
15494   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
15495   match(Set dst (LShiftVB src shift));
15496   effect(TEMP dst, TEMP tmp3, TEMP tmp2, TEMP tmp, TEMP scratch);
  format %{"vextracti64x4  $tmp,$src\n\t"
           "vpmovsxbw      $tmp,$tmp\n\t"
           "vpmovsxbw      $tmp2,$src\n\t"
           "vpsllw         $tmp,$tmp,$shift\n\t"
           "vpsllw         $tmp2,$tmp2,$shift\n\t"
           "vmovdqu        $tmp3,[0x00ff00ff,0x00ff00ff]\n\t"
           "vpbroadcastd   $tmp3,$tmp3\n\t"
           "vpand          $tmp,$tmp,$tmp3\n\t"
           "vpand          $tmp2,$tmp2,$tmp3\n\t"
           "vpackuswb      $dst,$tmp,$tmp2\n\t"
           "evmovdquq      $tmp3,[0x0604020007050301]\n\t"
           "vpermq         $dst,$tmp3,$dst\n\t! left shift for packed64B" %}
15509   ins_encode %{
15510     int vector_len = 2;
15511     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, 1);
15512     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
15513     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
15514     __ vpsllw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15515     __ vpsllw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
15516     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15517     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
15518     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
15519     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
15520     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
15521     __ evmovdquq($tmp3$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
15522     __ vpermq($dst$$XMMRegister, $tmp3$$XMMRegister, $dst$$XMMRegister, vector_len);
15523   %}
15524   ins_pipe( pipe_slow );
15525 %}
15526 
15527 // Shorts/Chars vector left shift
instruct vsll2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS dst (LShiftCntV shift)));
  format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVS src (LShiftCntV shift)));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS dst (LShiftCntV shift)));
  format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVS src (LShiftCntV shift)));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS dst shift));
  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS dst (LShiftCntV shift)));
  format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
  ins_encode %{
    __ psllw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVS src (LShiftCntV shift)));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVS src (LShiftCntV shift)));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (LShiftVS src shift));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (LShiftVS src (LShiftCntV shift)));
  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector left shift
instruct vsll2I(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI dst (LShiftCntV shift)));
  format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVI src (LShiftCntV shift)));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI dst shift));
  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI dst (LShiftCntV shift)));
  format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
  ins_encode %{
    __ pslld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVI src (LShiftCntV shift)));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVI src (LShiftCntV shift)));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVI src shift));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (LShiftVI src (LShiftCntV shift)));
  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector left shift
instruct vsll2L(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL dst shift));
  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  ins_encode %{
    __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL dst (LShiftCntV shift)));
  format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
  ins_encode %{
    __ psllq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (LShiftVL src (LShiftCntV shift)));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (LShiftVL src (LShiftCntV shift)));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVL src shift));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (LShiftVL src (LShiftCntV shift)));
  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// ----------------------- LogicalRightShift -----------------------------------

// Bytes vector logical right shift
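// As with left shifts, x86 has no byte-granularity shift instruction: bytes
// are zero-extended to 16-bit lanes with (v)pmovzxbw, shifted with (v)psrlw,
// masked back to byte range with 0x00ff, and re-packed with (v)packuswb.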
instruct vsrl4B_reg(vecS dst, vecS src, vecS shift, vecD tmp, vecD tmp2) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVB src shift));
  effect(TEMP tmp2, TEMP tmp);
  format %{"pmovzxbw   $tmp,$src\n\t"
           "psrlw      $tmp,$shift\n\t"
           "movdqu     $tmp2,[0x00ff00ff00ff00ff]\n\t"
           "pand       $tmp,$tmp2\n\t"
           "packuswb   $tmp,$tmp\n\t"
           "movss      $dst,$tmp\n\t! logical right shift for packed4B" %}
  ins_encode %{
    __ pmovzxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ psrlw($tmp$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
    __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8B_reg(vecD dst, vecD src, vecS shift, vecX tmp, vecX tmp2) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVB src shift));
  effect(TEMP tmp2, TEMP tmp);
  format %{"pmovzxbw   $tmp,$src\n\t"
           "psrlw      $tmp,$shift\n\t"
           "movdqu     $tmp2,[0x00ff00ff00ff00ff]\n\t"
           "pand       $tmp,$tmp2\n\t"
           "packuswb   $tmp,$tmp\n\t"
           "movsd      $dst,$tmp\n\t! logical right shift for packed8B" %}
  ins_encode %{
    __ pmovzxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ psrlw($tmp$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
    __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
    __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16B_reg(vecX dst, vecX src, vecS shift, vecX tmp, vecX tmp2, vecX tmp3) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVB src shift));
  effect(TEMP tmp2, TEMP tmp, TEMP tmp3);
  format %{"pmovzxbw  $tmp,$src\n\t"
           "psrlw     $tmp,$shift\n\t"
           "pshufd    $tmp2,$src,14\n\t"
           "pmovzxbw  $tmp2,$tmp2\n\t"
           "psrlw     $tmp2,$shift\n\t"
           "movdqu    $tmp3,[0x00ff00ff00ff00ff]\n\t"
           "pand      $tmp,$tmp3\n\t"
           "pand      $tmp2,$tmp3\n\t"
           "packuswb  $tmp,$tmp2\n\t"
           "movdqu    $dst,$tmp\n\t! logical right shift for packed16B" %}
  ins_encode %{
    __ pmovzxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ psrlw($tmp$$XMMRegister, $shift$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 14);
    __ pmovzxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ psrlw($tmp2$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
    __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
    __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16B_avx(vecX dst, vecX src, vecS shift, vecY tmp, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{"vpmovzxbw   $tmp,$src\n\t"
           "vpsrlw      $tmp,$tmp,$shift\n\t"
           "vpand       $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
           "vextracti128_high   $dst,$tmp\n\t"
           "vpackuswb   $dst,$tmp,$dst\n\t! logical right shift for packed16B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovzxbw($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpsrlw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
    __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, vecY tmp2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVB src shift));
  effect(TEMP tmp2, TEMP tmp, TEMP scratch);
  format %{"vextracti128_high  $tmp,$src\n\t"
           "vpmovzxbw   $tmp,$tmp\n\t"
           "vpmovzxbw   $tmp2,$src\n\t"
           "vpsrlw      $tmp,$tmp,$shift\n\t"
           "vpsrlw      $tmp2,$tmp2,$shift\n\t"
           "vpand       $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
           "vpand       $tmp2,$tmp2,[0x00ff00ff00ff00ff]\n\t"
           "vpackuswb   $dst,$tmp2,$tmp\n\t"
           "vpermq      $dst,$dst,0xD8\n\t! logical right shift for packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
    __ vpmovzxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpmovzxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpsrlw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vpsrlw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
    __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl64B(vecZ dst, vecZ src, vecS shift, vecZ tmp, vecZ tmp2, vecZ tmp3, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
  match(Set dst (URShiftVB src shift));
  effect(TEMP dst, TEMP tmp3, TEMP tmp2, TEMP tmp, TEMP scratch);
  format %{"vextracti64x4  $tmp,$src\n\t"
           "vpmovzxbw      $tmp,$tmp\n\t"
           "vpmovzxbw      $tmp2,$src\n\t"
           "vpsrlw         $tmp,$tmp,$shift\n\t"
           "vpsrlw         $tmp2,$tmp2,$shift\n\t"
           "vmovdqu        $tmp3,[0x00ff00ff00ff00ff]\n\t"
           "vpbroadcastd   $tmp3,$tmp3\n\t"
           "vpand          $tmp,$tmp,$tmp3\n\t"
           "vpand          $tmp2,$tmp2,$tmp3\n\t"
           "vpackuswb      $dst,$tmp,$tmp2\n\t"
           "evmovdquq      $tmp3,[0x0604020007050301]\n\t"
           "vpermq         $dst,$tmp3,$dst\n\t! logical right shift for packed64B" %}
  ins_encode %{
    int vector_len = 2;
    __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, 1);
    __ vpmovzxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpmovzxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
    __ vpsrlw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vpsrlw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
    __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
    __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
    __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
    __ evmovdquq($tmp3$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
    __ vpermq($dst$$XMMRegister, $tmp3$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Shorts vector logical right shift produces an incorrect Java result for
// negative data, because Java code converts a short value to an int with
// sign extension before shifting. Char vectors are fine, since chars are
// unsigned values.
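// A minimal Java illustration (not from this file) of the mismatch:
//
//   static short urshiftShort(short x) {
//     return (short)(x >>> 3); // x is sign-extended to int first, so for
//   }                          // x < 0 ones are shifted into bits 15..13
//
// A 16-bit psrlw by 3 would shift in zeros instead, so the results disagree
// for negative shorts; chars are zero-extended, so both forms agree.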

instruct vsrl2S(vecS dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_imm(vecS dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS dst (RShiftCntV shift)));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVS src (RShiftCntV shift)));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS dst (RShiftCntV shift)));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVS src (RShiftCntV shift)));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst shift));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS dst (RShiftCntV shift)));
  format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVS src (RShiftCntV shift)));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVS src (RShiftCntV shift)));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVS src shift));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
  match(Set dst (URShiftVS src (RShiftCntV shift)));
  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Integers vector logical right shift
instruct vsrl2I(vecD dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_imm(vecD dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI dst (RShiftCntV shift)));
  format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVI src (RShiftCntV shift)));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst shift));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI dst (RShiftCntV shift)));
  format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    __ psrld($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVI src (RShiftCntV shift)));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVI src (RShiftCntV shift)));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVI src shift));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
  match(Set dst (URShiftVI src (RShiftCntV shift)));
  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Longs vector logical right shift
instruct vsrl2L(vecX dst, vecS shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst shift));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_imm(vecX dst, immI8 shift) %{
  predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL dst (RShiftCntV shift)));
  format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
  match(Set dst (URShiftVL src (RShiftCntV shift)));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
  match(Set dst (URShiftVL src (RShiftCntV shift)));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVL src shift));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
  match(Set dst (URShiftVL src (RShiftCntV shift)));
  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
  ins_encode %{
    int vector_len = 2;
    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// ------------------- ArithmeticRightShift -----------------------------------

// Byte vector arithmetic right shift
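// Same widen/shift/mask/pack scheme as the logical right shift above, except
// bytes are sign-extended with (v)pmovsxbw and shifted with (v)psraw so the
// sign bit is replicated into the vacated positions.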
16464 instruct vsra4B_reg(vecS dst, vecS src, vecS shift, vecD tmp, vecD tmp2) %{
16465   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
16466   match(Set dst (RShiftVB src shift));
16467   effect(TEMP tmp2, TEMP tmp);
16468   format %{"pmovsxbw  $tmp,$src\n\t"
16469            "psraw     $tmp,$shift\n\t"
16470            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
16471            "pand      $tmp,$tmp2\n\t"
16472            "packuswb  $tmp,$tmp\n\t"
16473            "movss     $dst,$tmp\n\t! arithmetic right shift for packed4B" %}
16474   ins_encode %{
16475     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
16476     __ psraw($tmp$$XMMRegister, $shift$$XMMRegister);
16477     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16478     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
16479     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
16480     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
16481   %}
16482   ins_pipe( pipe_slow );
16483 %}
16484 
16485 instruct vsra8B_reg(vecD dst, vecD src, vecS shift, vecX tmp, vecX tmp2) %{
16486   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
16487   match(Set dst (RShiftVB src shift));
16488   effect(TEMP tmp2, TEMP tmp);
16489   format %{"pmovsxbw  $tmp,$src\n\t"
16490            "psraw     $tmp,$shift\n\t"
16491            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
16492            "pand      $tmp,$tmp2\n\t"
16493            "packuswb  $tmp,$tmp\n\t"
16494            "movsd     $dst,$tmp\n\t! arithmetic right shift for packed8B" %}
16495   ins_encode %{
16496     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
16497     __ psraw($tmp$$XMMRegister, $shift$$XMMRegister);
16498     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16499     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
16500     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
16501     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
16502   %}
16503   ins_pipe( pipe_slow );
16504 %}
16505 
16506 instruct vsra16B_reg(vecX dst, vecX src, vecS shift, vecX tmp, vecX tmp2, vecX tmp3) %{
16507   predicate(UseSSE > 3  && n->as_Vector()->length() == 16);
16508   match(Set dst (RShiftVB src shift));
16509   effect(TEMP tmp2, TEMP tmp, TEMP tmp3);
16510   format %{"pmovsxbw  $tmp,$src\n\t"
16511            "psraw     $tmp,$shift\n\t"
16512            "pshufd    $tmp2,$src\n\t"
16513            "pmovsxbw  $tmp2,$tmp2\n\t"
16514            "psraw     $tmp2,$shift\n\t"
16515            "movdqu    $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
16516            "pand      $tmp,$tmp3\n\t"
16517            "pand      $tmp2,$tmp3\n\t"
16518            "packuswb  $tmp,$tmp2\n\t"
16519            "movdqu    $dst,$tmp\n\t! arithmetic right shift for packed16B" %}
16520   ins_encode %{
16521     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
16522     __ psraw($tmp$$XMMRegister, $shift$$XMMRegister);
16523     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
16524     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
16525     __ psraw($tmp2$$XMMRegister, $shift$$XMMRegister);
16526     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16527     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
16528     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
16529     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
16530     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
16531   %}
16532   ins_pipe( pipe_slow );
16533 %}
16534 
16535 instruct vsra16B_avx(vecX dst, vecX src, vecS shift, vecY tmp, rRegL scratch) %{
16536   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16537   match(Set dst (RShiftVB src shift));
16538   effect(TEMP dst, TEMP tmp, TEMP scratch);
16539   format %{"vpmovsxbw  $tmp,$src\n\t"
16540            "vpsraw     $tmp,$tmp,$shift\n\t"
16541            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
16542            "vextracti128_high  $dst,$tmp\n\t"
16543            "vpackuswb  $dst,$tmp,$dst\n\t! arithmetic right shift for packed16B" %}
16544   ins_encode %{
16545     int vector_len = 1;
16546     __ vpmovsxbw($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
16547     __ vpsraw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16548     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16549     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
16550     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
16551         %}
16552   ins_pipe( pipe_slow );
16553 %}
16554 
16555 instruct vsra32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, vecY tmp2, rRegL scratch) %{
16556   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
16557   match(Set dst (RShiftVB src shift));
16558   effect(TEMP tmp2, TEMP tmp, TEMP dst, TEMP scratch);
16559   format %{"vextracti128_high  $tmp,$src\n\t"
16560            "vpmovsxbw  $tmp,$tmp\n\t"
16561            "vpmovsxbw  $tmp2,$src\n\t"
16562            "vpsraw     $tmp,$tmp,$shift\n\t"
16563            "vpsraw     $tmp2,$tmp2,$shift\n\t"
16564            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
16565            "vpand      $tmp2,$tmp2,[0x00ff00ff0x00ff00ff]\n\t"
16566            "vpackuswb  $dst,$tmp2,$tmp\n\t"
16567            "vpermq     $dst,$dst,0xD8\n\t! arithmetic right shift for packed32B" %}
16568   ins_encode %{
16569     int vector_len = 1;
16570     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
16571     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
16572     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
16573     __ vpsraw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16574     __ vpsraw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
16575     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16576     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16577     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
16578     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
16579   %}
16580   ins_pipe( pipe_slow );
16581 %}
16582 
16583 instruct vsra64B(vecZ dst, vecZ src, vecS shift, vecZ tmp, vecZ tmp2, vecZ tmp3, rRegL scratch) %{
16584   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
16585   match(Set dst (RShiftVB src shift));
16586   effect(TEMP dst, TEMP tmp3, TEMP tmp2, TEMP tmp, TEMP scratch);
16587   format %{"vextracti64x4  $tmp,$src\n\t"
16588            "vpmovsxbw      $tmp,$tmp\n\t"
16589            "vpmovsxbw      $tmp2,$src\n\t"
16590            "vpsraw         $tmp,$tmp,$shift\n\t"
16591            "vpsraw         $tmp2,$tmp2,$shift\n\t"
16592            "vmovdqu        $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
16593            "vpbroadcastd   $tmp3,$tmp3\n\t"
16594            "vpand          $tmp,$tmp,$tmp3\n\t"
16595            "vpand          $tmp2,$tmp2,$tmp3\n\t"
16596            "vpackuswb      $dst,$tmp,$tmp2\n\t"
16597            "evmovdquq     $tmp3, [0x06040200070500301]\n\t"
16598            "vpermq  $dst,$tmp3,$dst\n\t! arithmetic right shift for packed64B" %}
16599   ins_encode %{
16600     int vector_len = 2;
16601     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, 1);
16602     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
16603     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
16604     __ vpsraw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16605     __ vpsraw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
16606     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16607     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16608     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16609     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16610     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
16611     __ evmovdquq($tmp3$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
16612     __ vpermq($dst$$XMMRegister, $tmp3$$XMMRegister, $dst$$XMMRegister, vector_len);
16613   %}
16614   ins_pipe( pipe_slow );
16615 %}
16616 
16617 // Shorts/Chars vector arithmetic right shift
16618 instruct vsra2S(vecS dst, vecS shift) %{
16619   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16620   match(Set dst (RShiftVS dst shift));
16621   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
16622   ins_encode %{
16623     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
16624   %}
16625   ins_pipe( pipe_slow );
16626 %}
16627 
16628 instruct vsra2S_imm(vecS dst, immI8 shift) %{
16629   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16630   match(Set dst (RShiftVS dst (RShiftCntV shift)));
16631   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
16632   ins_encode %{
16633     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
16634   %}
16635   ins_pipe( pipe_slow );
16636 %}
16637 
16638 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
16639   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16640   match(Set dst (RShiftVS src shift));
16641   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
16642   ins_encode %{
16643     int vector_len = 0;
16644     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16645   %}
16646   ins_pipe( pipe_slow );
16647 %}
16648 
16649 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
16650   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16651   match(Set dst (RShiftVS src (RShiftCntV shift)));
16652   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
16653   ins_encode %{
16654     int vector_len = 0;
16655     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16656   %}
16657   ins_pipe( pipe_slow );
16658 %}
16659 
16660 instruct vsra4S(vecD dst, vecS shift) %{
16661   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16662   match(Set dst (RShiftVS dst shift));
16663   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
16664   ins_encode %{
16665     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
16666   %}
16667   ins_pipe( pipe_slow );
16668 %}
16669 
16670 instruct vsra4S_imm(vecD dst, immI8 shift) %{
16671   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16672   match(Set dst (RShiftVS dst (RShiftCntV shift)));
16673   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
16674   ins_encode %{
16675     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
16676   %}
16677   ins_pipe( pipe_slow );
16678 %}
16679 
16680 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
16681   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16682   match(Set dst (RShiftVS src shift));
16683   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
16684   ins_encode %{
16685     int vector_len = 0;
16686     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16687   %}
16688   ins_pipe( pipe_slow );
16689 %}
16690 
16691 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
16692   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16693   match(Set dst (RShiftVS src (RShiftCntV shift)));
16694   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
16695   ins_encode %{
16696     int vector_len = 0;
16697     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16698   %}
16699   ins_pipe( pipe_slow );
16700 %}
16701 
16702 instruct vsra8S(vecX dst, vecS shift) %{
16703   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
16704   match(Set dst (RShiftVS dst shift));
16705   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
16706   ins_encode %{
16707     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
16708   %}
16709   ins_pipe( pipe_slow );
16710 %}
16711 
16712 instruct vsra8S_imm(vecX dst, immI8 shift) %{
16713   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
16714   match(Set dst (RShiftVS dst (RShiftCntV shift)));
16715   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
16716   ins_encode %{
16717     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
16718   %}
16719   ins_pipe( pipe_slow );
16720 %}
16721 
16722 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
16723   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
16724   match(Set dst (RShiftVS src shift));
16725   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
16726   ins_encode %{
16727     int vector_len = 0;
16728     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16729   %}
16730   ins_pipe( pipe_slow );
16731 %}
16732 
16733 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
16734   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
16735   match(Set dst (RShiftVS src (RShiftCntV shift)));
16736   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
16737   ins_encode %{
16738     int vector_len = 0;
16739     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16740   %}
16741   ins_pipe( pipe_slow );
16742 %}
16743 
16744 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
16745   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16746   match(Set dst (RShiftVS src shift));
16747   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
16748   ins_encode %{
16749     int vector_len = 1;
16750     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16751   %}
16752   ins_pipe( pipe_slow );
16753 %}
16754 
16755 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
16756   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16757   match(Set dst (RShiftVS src (RShiftCntV shift)));
16758   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
16759   ins_encode %{
16760     int vector_len = 1;
16761     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16762   %}
16763   ins_pipe( pipe_slow );
16764 %}
16765 
16766 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
16767   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
16768   match(Set dst (RShiftVS src shift));
16769   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
16770   ins_encode %{
16771     int vector_len = 2;
16772     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16773   %}
16774   ins_pipe( pipe_slow );
16775 %}
16776 
16777 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16778   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
16779   match(Set dst (RShiftVS src (RShiftCntV shift)));
16780   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
16781   ins_encode %{
16782     int vector_len = 2;
16783     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16784   %}
16785   ins_pipe( pipe_slow );
16786 %}
16787 
16788 // Integers vector arithmetic right shift
16789 instruct vsra2I(vecD dst, vecS shift) %{
16790   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16791   match(Set dst (RShiftVI dst shift));
16792   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
16793   ins_encode %{
16794     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
16795   %}
16796   ins_pipe( pipe_slow );
16797 %}
16798 
16799 instruct vsra2I_imm(vecD dst, immI8 shift) %{
16800   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16801   match(Set dst (RShiftVI dst (RShiftCntV shift)));
16802   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
16803   ins_encode %{
16804     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
16805   %}
16806   ins_pipe( pipe_slow );
16807 %}
16808 
16809 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
16810   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16811   match(Set dst (RShiftVI src shift));
16812   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
16813   ins_encode %{
16814     int vector_len = 0;
16815     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16816   %}
16817   ins_pipe( pipe_slow );
16818 %}
16819 
16820 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
16821   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16822   match(Set dst (RShiftVI src (RShiftCntV shift)));
16823   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
16824   ins_encode %{
16825     int vector_len = 0;
16826     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16827   %}
16828   ins_pipe( pipe_slow );
16829 %}
16830 
16831 instruct vsra4I(vecX dst, vecS shift) %{
16832   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16833   match(Set dst (RShiftVI dst shift));
16834   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
16835   ins_encode %{
16836     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
16837   %}
16838   ins_pipe( pipe_slow );
16839 %}
16840 
16841 instruct vsra4I_imm(vecX dst, immI8 shift) %{
16842   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16843   match(Set dst (RShiftVI dst (RShiftCntV shift)));
16844   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
16845   ins_encode %{
16846     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
16847   %}
16848   ins_pipe( pipe_slow );
16849 %}
16850 
16851 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
16852   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16853   match(Set dst (RShiftVI src shift));
16854   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
16855   ins_encode %{
16856     int vector_len = 0;
16857     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16858   %}
16859   ins_pipe( pipe_slow );
16860 %}
16861 
16862 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
16863   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16864   match(Set dst (RShiftVI src (RShiftCntV shift)));
16865   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
16866   ins_encode %{
16867     int vector_len = 0;
16868     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16869   %}
16870   ins_pipe( pipe_slow );
16871 %}
16872 
16873 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
16874   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
16875   match(Set dst (RShiftVI src shift));
16876   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
16877   ins_encode %{
16878     int vector_len = 1;
16879     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16880   %}
16881   ins_pipe( pipe_slow );
16882 %}
16883 
16884 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
16885   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
16886   match(Set dst (RShiftVI src (RShiftCntV shift)));
16887   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
16888   ins_encode %{
16889     int vector_len = 1;
16890     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16891   %}
16892   ins_pipe( pipe_slow );
16893 %}
16894 
16895 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
16896   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
16897   match(Set dst (RShiftVI src shift));
16898   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
16899   ins_encode %{
16900     int vector_len = 2;
16901     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16902   %}
16903   ins_pipe( pipe_slow );
16904 %}
16905 
16906 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16907   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
16908   match(Set dst (RShiftVI src (RShiftCntV shift)));
16909   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
16910   ins_encode %{
16911     int vector_len = 2;
16912     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16913   %}
16914   ins_pipe( pipe_slow );
16915 %}
16916 
16917 // Long vector arithmetic right shift
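// Before AVX-512 there is no packed arithmetic right shift for longs (no
// psraq), so the rules below synthesize it from the logical shift via the
// usual sign-mask identity (>>> denotes the logical shift):
//   sra(x, s) == ((x >>> s) ^ m) - m,  where m = 0x8000000000000000 >>> s
// The xor/subtract pair re-extends the sign bits that psrlq zeroed out.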
16918 instruct vsra1L(vecD dst, vecD src, vecS shift, vecD tmp) %{
16919   predicate(n->as_Vector()->length() == 1);
16920   match(Set dst (RShiftVL src shift));
16921   effect(TEMP dst, TEMP tmp);
16922   format %{ "movdqu  $dst,$src\n\t"
16923             "psrlq   $dst,$shift\n\t"
16924             "movdqu  $tmp,[0x8000000000000000]\n\t"
16925             "psrlq   $tmp,$shift\n\t"
16926             "pxor    $dst,$tmp\n\t"
16927             "psubq   $dst,$tmp\t! arithmetic right shift packed1L" %}
16928   ins_encode %{
16929     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
16930     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16932     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
16933     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
16934     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
16935   %}
16936   ins_pipe( pipe_slow );
16937 %}
16938 
16939 instruct vsra1L_imm(vecD dst, vecD src, immI8 shift, vecD tmp) %{
16940   predicate(n->as_Vector()->length() == 1);
16941   match(Set dst (RShiftVL src (RShiftCntV shift)));
16942   effect(TEMP dst, TEMP tmp);
16943   format %{ "movdqu  $dst,$src\n\t"
16944             "psrlq   $dst,$shift\n\t"
16945             "movdqu  $tmp,[0x8000000000000000]\n\t"
16946             "psrlq   $tmp,$shift\n\t"
16947             "pxor    $dst,$tmp\n\t"
16948             "psubq   $dst,$tmp\t! arithmetic right shift packed1L" %}
16949   ins_encode %{
16950     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
16951     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16953     __ psrlq($tmp$$XMMRegister, (int)$shift$$constant);
16954     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
16955     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
16956   %}
16957   ins_pipe( pipe_slow );
16958 %}
16959 
16960 instruct vsra1L_reg(vecD dst, vecD src, vecS shift, vecD tmp) %{
16961   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
16962   match(Set dst (RShiftVL src shift));
16963   effect(TEMP dst, TEMP tmp);
16964   format %{ "vpsrlq   $dst,$src,$shift\n\t"
16965             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
16966             "vpsrlq   $tmp,$tmp,$shift\n\t"
16967             "vpxor    $dst,$dst,$tmp\n\t"
16968             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed1L" %}
16969   ins_encode %{
16970     int vector_len = 0;
16971     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16973     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16974     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16975     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16976   %}
16977   ins_pipe( pipe_slow );
16978 %}
16979 
16980 instruct vsra1L_reg_imm(vecD dst, vecD src, immI8 shift, vecD tmp) %{
16981   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
16982   match(Set dst (RShiftVL src (RShiftCntV shift)));
16983   effect(TEMP dst, TEMP tmp);
16984   format %{ "vpsrlq   $dst,$src,$shift\n\t"
16985             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
16986             "vpsrlq   $tmp,$tmp,$shift\n\t"
16987             "vpxor    $dst,$dst,$tmp\n\t"
16988             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed1L" %}
16989   ins_encode %{
16990     int vector_len = 0;
16991     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16993     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
16994     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16995     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16996   %}
16997   ins_pipe( pipe_slow );
16998 %}
16999 
17000 instruct vsra1L_reg_evex(vecD dst, vecD src, vecS shift) %{
17001   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 1);
17002   match(Set dst (RShiftVL src shift));
17003   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed1L" %}
17004   ins_encode %{
17005     int vector_len = 0;
17006     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17007   %}
17008   ins_pipe( pipe_slow );
17009 %}
17010 
17011 instruct vsra2L_reg_imm(vecX dst, vecX src, immI8 shift, vecX tmp) %{
17012   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
17013   match(Set dst (RShiftVL src (RShiftCntV shift)));
17014   effect(TEMP dst, TEMP tmp);
17015   format %{ "movdqu  $dst,$src\n\t"
17016             "psrlq   $dst,$shift\n\t"
17017             "movdqu  $tmp,[0x8000000000000000]\n\t"
17018             "psrlq   $tmp,$shift\n\t"
17019             "pxor    $dst,$tmp\n\t"
17020             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
17021   ins_encode %{
17022     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
17023     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17025     __ psrlq($tmp$$XMMRegister, (int)$shift$$constant);
17026     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
17027     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
17028   %}
17029   ins_pipe( pipe_slow );
17030 %}
17031 
17032 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp) %{
17033   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
17034   match(Set dst (RShiftVL src shift));
17035   effect(TEMP dst, TEMP tmp);
17036   format %{ "movdqu  $dst,$src\n\t"
17037             "psrlq   $dst,$shift\n\t"
17038             "movdqu  $tmp,[0x8000000000000000]\n\t"
17039             "psrlq   $tmp,$shift\n\t"
17040             "pxor    $dst,$tmp\n\t"
17041             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
17042   ins_encode %{
17043     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
17044     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17046     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
17047     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
17048     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
17049   %}
17050   ins_pipe( pipe_slow );
17051 %}
17052 
17053 instruct vsra2L_reg_evex_imm(vecX dst, vecX src, immI8 shift) %{
17054   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2);
17055   match(Set dst (RShiftVL src (RShiftCntV shift)));
17056   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
17057   ins_encode %{
17058     int vector_len = 0;
17059     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17060   %}
17061   ins_pipe( pipe_slow );
17062 %}
17063 
17064 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
17065   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2);
17066   match(Set dst (RShiftVL src shift));
17067   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
17068   ins_encode %{
17069     int vector_len = 0;
17070     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17071   %}
17072   ins_pipe( pipe_slow );
17073 %}
17074 
17075 instruct vsra4L_reg_imm(vecY dst, vecY src, immI8 shift, vecY tmp) %{
17076   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
17077   match(Set dst (RShiftVL src (RShiftCntV shift)));
17078   effect(TEMP dst, TEMP tmp);
17079   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17080             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17081             "vpsrlq   $tmp,$tmp,$shift\n\t"
17082             "vpxor    $dst,$dst,$tmp\n\t"
17083             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
17084   ins_encode %{
17085     int vector_len = 1;
17086     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17088     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
17089     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17090     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17091   %}
17092   ins_pipe( pipe_slow );
17093 %}
17094 
17095 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp) %{
17096   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
17097   match(Set dst (RShiftVL src shift));
17098   effect(TEMP dst, TEMP tmp);
17099   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17100             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17101             "vpsrlq   $tmp,$tmp,$shift\n\t"
17102             "vpxor    $dst,$dst,$tmp\n\t"
17103             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
17104   ins_encode %{
17105     int vector_len = 1;
17106     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17108     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17109     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17110     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17111   %}
17112   ins_pipe( pipe_slow );
17113 %}
17114 
17115 instruct vsra4L_reg_evex_imm(vecY dst, vecY src, immI8 shift) %{
17116   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4);
17117   match(Set dst (RShiftVL src (RShiftCntV shift)));
  format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
17119   ins_encode %{
17120     int vector_len = 1;
17121     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17122   %}
17123   ins_pipe( pipe_slow );
17124 %}
17125 
17126 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
17127   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4);
17128   match(Set dst (RShiftVL src shift));
17129   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
17130   ins_encode %{
17131     int vector_len = 1;
17132     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17133   %}
17134   ins_pipe( pipe_slow );
17135 %}
17136 
17137 instruct vsra8L_reg_evex_imm(vecZ dst, vecZ src, immI8 shift) %{
17138   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
17139   match(Set dst (RShiftVL src (RShiftCntV shift)));
  format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed8L" %}
17141   ins_encode %{
17142     int vector_len = 2;
17143     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17144   %}
17145   ins_pipe( pipe_slow );
17146 %}
17147 
17148 instruct vsra8L_reg_evex(vecZ dst, vecZ src, vecS shift) %{
17149   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
17150   match(Set dst (RShiftVL src shift));
17151   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed8L" %}
17152   ins_encode %{
17153     int vector_len = 2;
17154     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17155   %}
17156   ins_pipe( pipe_slow );
17157 %}
17158 
17159 // ------------------- Variable Bit Shift Left Logical -----------------------------
// Integer variable left shift
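// A variable shift moves each element by the matching element of the shift
// vector (AVX2 vpsllvd/vpsllvq). The n->in(2)->Opcode() != Op_LShiftCntV
// test keeps these rules off the scalar-count case, which the fixed-count
// rules above already handle.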
17161 instruct vsllv2I(vecD dst, vecD src, vecD shift) %{
17162   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17163   match(Set dst (LShiftVI src shift));
17164   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed2I" %}
17165   ins_encode %{
17166     int vector_len = 0;
17167     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17168   %}
17169   ins_pipe( pipe_slow );
17170 %}
17171 
17172 instruct vsllv4I_reg(vecX dst, vecX src, vecX shift) %{
17173   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17174   match(Set dst (LShiftVI src shift));
17175   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed4I" %}
17176   ins_encode %{
17177     int vector_len = 0;
17178     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17179   %}
17180   ins_pipe( pipe_slow );
17181 %}
17182 
17183 instruct vsllv4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17184   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17185   match(Set dst (LShiftVI src shift));
17186   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed4I" %}
17187   ins_encode %{
17188     int vector_len = 0;
17189     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17190   %}
17191   ins_pipe( pipe_slow );
17192 %}
17193 
17194 instruct vsllv8I_reg(vecY dst, vecY src, vecY shift) %{
17195   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17196   match(Set dst (LShiftVI src shift));
17197   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed8I" %}
17198   ins_encode %{
17199     int vector_len = 1;
17200     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17201   %}
17202   ins_pipe( pipe_slow );
17203 %}
17204 
17205 instruct vsllv8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17206   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17207   match(Set dst (LShiftVI src shift));
17208   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed8I" %}
17209   ins_encode %{
17210     int vector_len = 1;
17211     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17212   %}
17213   ins_pipe( pipe_slow );
17214 %}
17215 
17216 instruct vsllv16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17217   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_LShiftCntV);
17218   match(Set dst (LShiftVI src shift));
17219   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed16I" %}
17220   ins_encode %{
17221     int vector_len = 2;
17222     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17223   %}
17224   ins_pipe( pipe_slow );
17225 %}
17226 
// Long variable left shift
17228 instruct vsllv1L_reg(vecD dst, vecD src, vecD shift) %{
17229   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_LShiftCntV);
17230   match(Set dst (LShiftVL src shift));
17231   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed1L" %}
17232   ins_encode %{
17233     int vector_len = 0;
17234     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17235   %}
17236   ins_pipe( pipe_slow );
17237 %}
17238 
17239 instruct vsllv2L_reg(vecX dst, vecX src, vecX shift) %{
17240   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17241   match(Set dst (LShiftVL src shift));
17242   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed2L" %}
17243   ins_encode %{
17244     int vector_len = 0;
17245     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17246   %}
17247   ins_pipe( pipe_slow );
17248 %}
17249 
17250 instruct vsllv2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17251   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17252   match(Set dst (LShiftVL src shift));
17253   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed2L" %}
17254   ins_encode %{
17255     int vector_len = 0;
17256     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17257   %}
17258   ins_pipe( pipe_slow );
17259 %}
17260 
17261 instruct vsllv4L_reg(vecY dst, vecY src, vecY shift) %{
17262   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17263   match(Set dst (LShiftVL src shift));
17264   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed4L" %}
17265   ins_encode %{
17266     int vector_len = 1;
17267     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17268   %}
17269   ins_pipe( pipe_slow );
17270 %}
17271 
17272 instruct vsllv4L_reg_evex(vecY dst, vecY src, vecY shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17274   match(Set dst (LShiftVL src shift));
17275   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed4L" %}
17276   ins_encode %{
17277     int vector_len = 1;
17278     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17279   %}
17280   ins_pipe( pipe_slow );
17281 %}
17282 
17283 instruct vsllv8L_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17284   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17285   match(Set dst (LShiftVL src shift));
  format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed8L" %}
17287   ins_encode %{
17288     int vector_len = 2;
17289     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17290   %}
17291   ins_pipe( pipe_slow );
17292 %}
17293 
17294 // ------------------- Variable Bit Shift Right Logical -----------------------------
// Integer variable logical right shift
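// These map one-to-one onto vpsrlvd/vpsrlvq; a logical right shift
// zero-fills, so no sign fix-up is needed.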
17296 instruct vsrlv2I_reg(vecD dst, vecD src, vecD shift) %{
17297   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17298   match(Set dst (URShiftVI src shift));
17299   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed2I" %}
17300   ins_encode %{
17301     int vector_len = 0;
17302     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17303   %}
17304   ins_pipe( pipe_slow );
17305 %}
17306 
17307 instruct vsrlv4I_reg(vecX dst, vecX src, vecX shift) %{
17308   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17309   match(Set dst (URShiftVI src shift));
17310   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17311   ins_encode %{
17312     int vector_len = 0;
17313     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17314   %}
17315   ins_pipe( pipe_slow );
17316 %}
17317 
17318 instruct vsrlv4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17319   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17320   match(Set dst (URShiftVI src shift));
17321   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17322   ins_encode %{
17323     int vector_len = 0;
17324     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17325   %}
17326   ins_pipe( pipe_slow );
17327 %}
17328 
17329 instruct vsrlv8I_reg(vecY dst, vecY src, vecY shift) %{
17330   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17331   match(Set dst (URShiftVI src shift));
17332   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17333   ins_encode %{
17334     int vector_len = 1;
17335     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17336   %}
17337   ins_pipe( pipe_slow );
17338 %}
17339 
17340 instruct vsrlv8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17341   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17342   match(Set dst (URShiftVI src shift));
17343   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17344   ins_encode %{
17345     int vector_len = 1;
17346     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17347   %}
17348   ins_pipe( pipe_slow );
17349 %}
17350 
17351 instruct vsrlv16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17352   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_RShiftCntV);
17353   match(Set dst (URShiftVI src shift));
17354   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed16I" %}
17355   ins_encode %{
17356     int vector_len = 2;
17357     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17358   %}
17359   ins_pipe( pipe_slow );
17360 %}
17361 
// Long variable logical right shift
17363 instruct vsrlv1L_reg(vecD dst, vecD src, vecD shift) %{
17364   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17365   match(Set dst (URShiftVL src shift));
17366   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed1L" %}
17367   ins_encode %{
17368     int vector_len = 0;
17369     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17370   %}
17371   ins_pipe( pipe_slow );
17372 %}
17373 
17374 instruct vsrlv2L_reg(vecX dst, vecX src, vecX shift) %{
17375   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17376   match(Set dst (URShiftVL src shift));
17377   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed2L" %}
17378   ins_encode %{
17379     int vector_len = 0;
17380     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17381   %}
17382   ins_pipe( pipe_slow );
17383 %}
17384 
17385 instruct vsrlv2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17386   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17387   match(Set dst (URShiftVL src shift));
17388   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed2L" %}
17389   ins_encode %{
17390     int vector_len = 0;
17391     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17392   %}
17393   ins_pipe( pipe_slow );
17394 %}
17395 
17396 instruct vsrlv4L_reg(vecY dst, vecY src, vecY shift) %{
17397   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17398   match(Set dst (URShiftVL src shift));
17399   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed4L" %}
17400   ins_encode %{
17401     int vector_len = 1;
17402     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17403   %}
17404   ins_pipe( pipe_slow );
17405 %}
17406 
17407 instruct vsrlv4L_reg_evex(vecY dst, vecY src, vecY shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17409   match(Set dst (URShiftVL src shift));
17410   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed4L" %}
17411   ins_encode %{
17412     int vector_len = 1;
17413     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17414   %}
17415   ins_pipe( pipe_slow );
17416 %}
17417 
17418 instruct vsrlv8L_reg(vecZ dst, vecZ src, vecZ shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17420   match(Set dst (URShiftVL src shift));
17421   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed8L" %}
17422   ins_encode %{
17423     int vector_len = 2;
17424     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17425   %}
17426   ins_pipe( pipe_slow );
17427 %}
17428 
17429 // ------------------- Variable Bit Shift Right Arithmetic -----------------------------
// Integer variable arithmetic right shift
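// AVX2 provides vpsravd, so the int forms are again a single instruction.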
17431 instruct vsrav2I_reg(vecD dst, vecD src, vecD shift) %{
17432   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17433   match(Set dst (RShiftVI src shift));
17434   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed2I" %}
17435   ins_encode %{
17436     int vector_len = 0;
17437     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17438   %}
17439   ins_pipe( pipe_slow );
17440 %}
17441 
17442 instruct vsrav4I_reg(vecX dst, vecX src, vecX shift) %{
17443   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17444   match(Set dst (RShiftVI src shift));
17445   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17446   ins_encode %{
17447     int vector_len = 0;
17448     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17449   %}
17450   ins_pipe( pipe_slow );
17451 %}
17452 
17453 instruct vsrav4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17454   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17455   match(Set dst (RShiftVI src shift));
17456   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17457   ins_encode %{
17458     int vector_len = 0;
17459     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17460   %}
17461   ins_pipe( pipe_slow );
17462 %}
17463 
17464 instruct vsrav8I_reg(vecY dst, vecY src, vecY shift) %{
17465   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17466   match(Set dst (RShiftVI src shift));
17467   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17468   ins_encode %{
17469     int vector_len = 1;
17470     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17471   %}
17472   ins_pipe( pipe_slow );
17473 %}
17474 
17475 instruct vsrav8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17476   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17477   match(Set dst (RShiftVI src shift));
17478   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17479   ins_encode %{
17480     int vector_len = 1;
17481     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17482   %}
17483   ins_pipe( pipe_slow );
17484 %}
17485 
17486 instruct vsrav16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17487   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_RShiftCntV);
17488   match(Set dst (RShiftVI src shift));
17489   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed16I" %}
17490   ins_encode %{
17491     int vector_len = 2;
17492     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17493   %}
17494   ins_pipe( pipe_slow );
17495 %}
17496 
// Long variable arithmetic right shift
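// AVX2 has no vpsravq, so the long forms reuse the sign-mask identity from
// the fixed-count long shifts, only with the variable vpsrlvq; the _evex
// variants emit evpsravq directly where AVX-512 makes it available.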
17498 instruct vsrav1L_reg(vecD dst, vecD src, vecD shift, vecD tmp) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17500   match(Set dst (RShiftVL src shift));
17501   effect(TEMP dst, TEMP tmp);
17502   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17503             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17504             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17505             "vpxor     $dst,$dst,$tmp\n\t"
17506             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
    __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
17517 
17518 instruct vsrav1L_reg_evex(vecD dst, vecD src, vecD shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17520   match(Set dst (RShiftVL src shift));
17521   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed1L" %}
17522   ins_encode %{
17523     int vector_len = 0;
17524     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17525   %}
17526   ins_pipe( pipe_slow );
17527 %}
17528 
17529 instruct vsrav2L_reg(vecX dst, vecX src, vecX shift, vecX tmp) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17531   match(Set dst (RShiftVL src shift));
17532   effect(TEMP dst, TEMP tmp);
17533   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17534             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17535             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17536             "vpxor     $dst,$dst,$tmp\n\t"
17537             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed2L" %}
17538   ins_encode %{
17539     int vector_len = 0;
17540     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17542     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17543     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17544     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17545   %}
17546   ins_pipe( pipe_slow );
17547 %}
17548 
17549 instruct vsrav2L_reg_evex(vecX dst, vecX src, vecX shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17551   match(Set dst (RShiftVL src shift));
17552   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed2L" %}
17553   ins_encode %{
17554     int vector_len = 0;
17555     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17556   %}
17557   ins_pipe( pipe_slow );
17558 %}
17559 
17560 instruct vsrav4L_reg(vecY dst, vecY src, vecY shift, vecY tmp) %{
17561   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17562   match(Set dst (RShiftVL src shift));
17563   effect(TEMP dst, TEMP tmp);
17564   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17565             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17566             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17567             "vpxor     $dst,$dst,$tmp\n\t"
17568             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed4L" %}
17569   ins_encode %{
17570     int vector_len = 1;
17571     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17573     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17574     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17575     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17576   %}
17577   ins_pipe( pipe_slow );
17578 %}
17579 
17580 instruct vsrav4L_reg_evex(vecY dst, vecY src, vecY shift) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17582   match(Set dst (RShiftVL src shift));
  format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed4L" %}
17584   ins_encode %{
17585     int vector_len = 1;
17586     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17587   %}
17588   ins_pipe( pipe_slow );
17589 %}
17590 
17591 instruct vsrav8L_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17592   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17593   match(Set dst (RShiftVL src shift));
  format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed8L" %}
17595   ins_encode %{
17596     int vector_len = 2;
17597     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17598   %}
17599   ins_pipe( pipe_slow );
17600 %}
17601 
17602 // --------------------------------- AND --------------------------------------
17603 
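// Bitwise logic ignores the element type, so these rules dispatch on
// length_in_bytes() alone; one pand/vpand covers every lane width, and the
// _mem forms fold the load of the second operand.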
17604 instruct vand4B(vecS dst, vecS src) %{
17605   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
17606   match(Set dst (AndV dst src));
17607   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
17608   ins_encode %{
17609     __ pand($dst$$XMMRegister, $src$$XMMRegister);
17610   %}
17611   ins_pipe( pipe_slow );
17612 %}
17613 
17614 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
17615   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17616   match(Set dst (AndV src1 src2));
17617   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
17618   ins_encode %{
17619     int vector_len = 0;
17620     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17621   %}
17622   ins_pipe( pipe_slow );
17623 %}
17624 
17625 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
17626   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17627   match(Set dst (AndV src (LoadVector mem)));
17628   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
17629   ins_encode %{
17630     int vector_len = 0;
17631     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17632   %}
17633   ins_pipe( pipe_slow );
17634 %}
17635 
17636 instruct vand8B(vecD dst, vecD src) %{
17637   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
17638   match(Set dst (AndV dst src));
17639   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
17640   ins_encode %{
17641     __ pand($dst$$XMMRegister, $src$$XMMRegister);
17642   %}
17643   ins_pipe( pipe_slow );
17644 %}
17645 
17646 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
17647   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17648   match(Set dst (AndV src1 src2));
17649   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
17650   ins_encode %{
17651     int vector_len = 0;
17652     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17653   %}
17654   ins_pipe( pipe_slow );
17655 %}
17656 
17657 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
17658   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17659   match(Set dst (AndV src (LoadVector mem)));
17660   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
17661   ins_encode %{
17662     int vector_len = 0;
17663     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17664   %}
17665   ins_pipe( pipe_slow );
17666 %}
17667 
17668 instruct vand16B(vecX dst, vecX src) %{
17669   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
17670   match(Set dst (AndV dst src));
17671   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
17672   ins_encode %{
17673     __ pand($dst$$XMMRegister, $src$$XMMRegister);
17674   %}
17675   ins_pipe( pipe_slow );
17676 %}
17677 
17678 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
17679   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17680   match(Set dst (AndV src1 src2));
17681   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
17682   ins_encode %{
17683     int vector_len = 0;
17684     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17685   %}
17686   ins_pipe( pipe_slow );
17687 %}
17688 
17689 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
17690   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17691   match(Set dst (AndV src (LoadVector mem)));
17692   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
17693   ins_encode %{
17694     int vector_len = 0;
17695     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17696   %}
17697   ins_pipe( pipe_slow );
17698 %}
17699 
17700 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
17701   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17702   match(Set dst (AndV src1 src2));
17703   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
17704   ins_encode %{
17705     int vector_len = 1;
17706     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17707   %}
17708   ins_pipe( pipe_slow );
17709 %}
17710 
17711 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
17712   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17713   match(Set dst (AndV src (LoadVector mem)));
17714   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
17715   ins_encode %{
17716     int vector_len = 1;
17717     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17718   %}
17719   ins_pipe( pipe_slow );
17720 %}
17721 
17722 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
17723   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17724   match(Set dst (AndV src1 src2));
17725   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
17726   ins_encode %{
17727     int vector_len = 2;
17728     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17729   %}
17730   ins_pipe( pipe_slow );
17731 %}
17732 
17733 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
17734   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17735   match(Set dst (AndV src (LoadVector mem)));
17736   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
17737   ins_encode %{
17738     int vector_len = 2;
17739     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17740   %}
17741   ins_pipe( pipe_slow );
17742 %}
17743 
17744 // --------------------------------- OR ---------------------------------------
17745 
17746 instruct vor4B(vecS dst, vecS src) %{
17747   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
17748   match(Set dst (OrV dst src));
17749   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
17750   ins_encode %{
17751     __ por($dst$$XMMRegister, $src$$XMMRegister);
17752   %}
17753   ins_pipe( pipe_slow );
17754 %}
17755 
17756 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
17757   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17758   match(Set dst (OrV src1 src2));
17759   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
17760   ins_encode %{
17761     int vector_len = 0;
17762     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17763   %}
17764   ins_pipe( pipe_slow );
17765 %}
17766 
17767 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
17768   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17769   match(Set dst (OrV src (LoadVector mem)));
17770   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
17771   ins_encode %{
17772     int vector_len = 0;
17773     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17774   %}
17775   ins_pipe( pipe_slow );
17776 %}
17777 
17778 instruct vor8B(vecD dst, vecD src) %{
17779   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
17780   match(Set dst (OrV dst src));
17781   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
17782   ins_encode %{
17783     __ por($dst$$XMMRegister, $src$$XMMRegister);
17784   %}
17785   ins_pipe( pipe_slow );
17786 %}
17787 
17788 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
17789   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17790   match(Set dst (OrV src1 src2));
17791   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
17792   ins_encode %{
17793     int vector_len = 0;
17794     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17795   %}
17796   ins_pipe( pipe_slow );
17797 %}
17798 
17799 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17801   match(Set dst (OrV src (LoadVector mem)));
17802   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
17803   ins_encode %{
17804     int vector_len = 0;
17805     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17806   %}
17807   ins_pipe( pipe_slow );
17808 %}
17809 
17810 instruct vor16B(vecX dst, vecX src) %{
17811   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
17812   match(Set dst (OrV dst src));
17813   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
17814   ins_encode %{
17815     __ por($dst$$XMMRegister, $src$$XMMRegister);
17816   %}
17817   ins_pipe( pipe_slow );
17818 %}
17819 
17820 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
17821   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17822   match(Set dst (OrV src1 src2));
17823   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
17824   ins_encode %{
17825     int vector_len = 0;
17826     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17827   %}
17828   ins_pipe( pipe_slow );
17829 %}
17830 
17831 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
17832   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17833   match(Set dst (OrV src (LoadVector mem)));
17834   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
17835   ins_encode %{
17836     int vector_len = 0;
17837     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17838   %}
17839   ins_pipe( pipe_slow );
17840 %}
17841 
17842 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
17843   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17844   match(Set dst (OrV src1 src2));
17845   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
17846   ins_encode %{
17847     int vector_len = 1;
17848     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17849   %}
17850   ins_pipe( pipe_slow );
17851 %}
17852 
17853 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
17854   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17855   match(Set dst (OrV src (LoadVector mem)));
17856   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
17857   ins_encode %{
17858     int vector_len = 1;
17859     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17860   %}
17861   ins_pipe( pipe_slow );
17862 %}
17863 
17864 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
17865   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17866   match(Set dst (OrV src1 src2));
17867   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
17868   ins_encode %{
17869     int vector_len = 2;
17870     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17871   %}
17872   ins_pipe( pipe_slow );
17873 %}
17874 
17875 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
17876   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17877   match(Set dst (OrV src (LoadVector mem)));
17878   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
17879   ins_encode %{
17880     int vector_len = 2;
17881     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17882   %}
17883   ins_pipe( pipe_slow );
17884 %}
17885 
17886 // --------------------------------- XOR --------------------------------------
17887 
17888 instruct vxor4B(vecS dst, vecS src) %{
17889   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
17890   match(Set dst (XorV dst src));
17891   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
17892   ins_encode %{
17893     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
17894   %}
17895   ins_pipe( pipe_slow );
17896 %}
17897 
17898 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
17899   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17900   match(Set dst (XorV src1 src2));
17901   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
17902   ins_encode %{
17903     int vector_len = 0;
17904     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17905   %}
17906   ins_pipe( pipe_slow );
17907 %}
17908 
17909 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
17910   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17911   match(Set dst (XorV src (LoadVector mem)));
17912   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
17913   ins_encode %{
17914     int vector_len = 0;
17915     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17916   %}
17917   ins_pipe( pipe_slow );
17918 %}
17919 
17920 instruct vxor8B(vecD dst, vecD src) %{
17921   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
17922   match(Set dst (XorV dst src));
17923   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
17924   ins_encode %{
17925     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
17926   %}
17927   ins_pipe( pipe_slow );
17928 %}
17929 
17930 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
17931   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17932   match(Set dst (XorV src1 src2));
17933   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
17934   ins_encode %{
17935     int vector_len = 0;
17936     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17937   %}
17938   ins_pipe( pipe_slow );
17939 %}
17940 
17941 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
17942   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17943   match(Set dst (XorV src (LoadVector mem)));
17944   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
17945   ins_encode %{
17946     int vector_len = 0;
17947     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17948   %}
17949   ins_pipe( pipe_slow );
17950 %}
17951 
17952 instruct vxor16B(vecX dst, vecX src) %{
17953   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
17954   match(Set dst (XorV dst src));
17955   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
17956   ins_encode %{
17957     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
17958   %}
17959   ins_pipe( pipe_slow );
17960 %}
17961 
17962 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
17963   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17964   match(Set dst (XorV src1 src2));
17965   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
17966   ins_encode %{
17967     int vector_len = 0;
17968     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17969   %}
17970   ins_pipe( pipe_slow );
17971 %}
17972 
17973 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
17974   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17975   match(Set dst (XorV src (LoadVector mem)));
17976   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
17977   ins_encode %{
17978     int vector_len = 0;
17979     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17980   %}
17981   ins_pipe( pipe_slow );
17982 %}
17983 
17984 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
17985   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17986   match(Set dst (XorV src1 src2));
17987   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
17988   ins_encode %{
17989     int vector_len = 1;
17990     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17991   %}
17992   ins_pipe( pipe_slow );
17993 %}
17994 
17995 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
17996   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17997   match(Set dst (XorV src (LoadVector mem)));
17998   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
17999   ins_encode %{
18000     int vector_len = 1;
18001     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18002   %}
18003   ins_pipe( pipe_slow );
18004 %}
18005 
18006 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
18007   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18008   match(Set dst (XorV src1 src2));
18009   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
18010   ins_encode %{
18011     int vector_len = 2;
18012     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18013   %}
18014   ins_pipe( pipe_slow );
18015 %}
18016 
18017 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
18018   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18019   match(Set dst (XorV src (LoadVector mem)));
18020   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
18021   ins_encode %{
18022     int vector_len = 2;
18023     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18024   %}
18025   ins_pipe( pipe_slow );
18026 %}
18027 
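// --------------------------------- VectorCast --------------------------------

// VectorCastB2X widens bytes with a sign-extending vpmovsxb* move; the
// floating-point targets then convert in-register with vcvtdq2ps/vcvtdq2pd.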
18028 instruct vcvt4Bto4S_reg(vecD dst, vecS src) %{
18029   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18030   match(Set dst (VectorCastB2X src));
18031   format %{ "vpmovsxbw   $dst,$src\t! convert 4B to 4S vector" %}
18032   ins_encode %{
18033     int vector_len = 0;
18034     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18035   %}
18036   ins_pipe( pipe_slow );
18037 %}
18038 
18039 instruct vcvt8Bto8S_reg(vecX dst, vecD src) %{
18040   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18041   match(Set dst (VectorCastB2X src));
18042   format %{ "vpmovsxbw   $dst,$src\t! convert 8B to 8S vector" %}
18043   ins_encode %{
18044     int vector_len = 0;
18045     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18046   %}
18047   ins_pipe( pipe_slow );
18048 %}
18049 
18050 instruct vcvt16Bto16S_reg(vecY dst, vecX src) %{
18051   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18052   match(Set dst (VectorCastB2X src));
18053   format %{ "vpmovsxbw   $dst,$src\t! convert 16B to 16S vector" %}
18054   ins_encode %{
18055     int vector_len = 1;
18056     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18057   %}
18058   ins_pipe( pipe_slow );
18059 %}
18060 
18061 instruct vcvt32Bto32S_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18063   match(Set dst (VectorCastB2X src));
18064   format %{ "vpmovsxbw   $dst,$src\t! convert 32B to 32S vector" %}
18065   ins_encode %{
18066     int vector_len = 2;
18067     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18068   %}
18069   ins_pipe( pipe_slow );
18070 %}
18071 
18072 instruct vcvt4Bto4I_reg(vecX dst, vecS src) %{
18073   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18074   match(Set dst (VectorCastB2X src));
18075   format %{ "vpmovsxbd   $dst,$src\t! convert 4B to 4I vector" %}
18076   ins_encode %{
18077     int vector_len = 0;
18078     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18079   %}
18080   ins_pipe( pipe_slow );
18081 %}
18082 
18083 instruct vcvt8Bto8I_reg(vecY dst, vecD src) %{
18084   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18085   match(Set dst (VectorCastB2X src));
18086   format %{ "vpmovsxbd   $dst,$src\t! convert 8B to 8I vector" %}
18087   ins_encode %{
18088     int vector_len = 1;
18089     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18090   %}
18091   ins_pipe( pipe_slow );
18092 %}
18093 
18094 instruct vcvt16Bto16I_reg(vecZ dst, vecX src) %{
18095   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18096   match(Set dst (VectorCastB2X src));
18097   format %{ "vpmovsxbd   $dst,$src\t! convert 16B to 16I vector" %}
18098   ins_encode %{
18099     int vector_len = 2;
18100     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18101   %}
18102   ins_pipe( pipe_slow );
18103 %}
18104 
18105 instruct vcvt4Bto4L_reg(vecY dst, vecS src) %{
18106   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18107   match(Set dst (VectorCastB2X src));
18108   format %{ "vpmovsxbq   $dst,$src\t! convert 4B to 4L vector" %}
18109   ins_encode %{
18110     int vector_len = 1;
18111     __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18112   %}
18113   ins_pipe( pipe_slow );
18114 %}
18115 
18116 instruct vcvt8Bto8L_reg(vecZ dst, vecD src) %{
18117   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18118   match(Set dst (VectorCastB2X src));
18119   format %{ "vpmovsxbq   $dst,$src\t! convert 8B to 8L vector" %}
18120   ins_encode %{
18121     int vector_len = 2;
18122     __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18123   %}
18124   ins_pipe( pipe_slow );
18125 %}
18126 
18127 instruct vcvt4Bto4F_reg(vecX dst, vecS src) %{
18128   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18129   match(Set dst (VectorCastB2X src));
18130   format %{ "vpmovsxbd   $dst,$src\n\t"
18131             "vcvtdq2ps   $dst,$dst\t! convert 4B to 4F vector" %}
18132   ins_encode %{
18133     int vector_len = 0;
18134     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18135     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18136   %}
18137   ins_pipe( pipe_slow );
18138 %}
18139 
18140 instruct vcvt8Bto8F_reg(vecY dst, vecD src) %{
18141   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18142   match(Set dst (VectorCastB2X src));
18143   format %{ "vpmovsxbd   $dst,$src\n\t"
18144             "vcvtdq2ps   $dst,$dst\t! convert 8B to 8F vector" %}
18145   ins_encode %{
18146     int vector_len = 1;
18147     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18148     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18149   %}
18150   ins_pipe( pipe_slow );
18151 %}
18152 
18153 instruct vcvt16Bto16F_reg(vecZ dst, vecX src) %{
18154   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18155   match(Set dst (VectorCastB2X src));
18156   format %{ "vpmovsxbd   $dst,$src\n\t"
18157             "vcvtdq2ps   $dst,$dst\t! convert 16B to 16F vector" %}
18158   ins_encode %{
18159     int vector_len = 2;
18160     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18161     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18162   %}
18163   ins_pipe( pipe_slow );
18164 %}
18165 
18166 instruct vcvt4Bto4D_reg(vecY dst, vecS src) %{
18167   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18168   match(Set dst (VectorCastB2X src));
18169   format %{ "vpmovsxbd   $dst,$src\n\t"
18170             "vcvtdq2pd   $dst,$dst\t! convert 4B to 4D vector" %}
18171   ins_encode %{
    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, 0); // 128-bit: 4B -> 4I in xmm
    __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, 1); // 256-bit: 4I -> 4D widens into ymm
18174   %}
18175   ins_pipe( pipe_slow );
18176 %}
18177 
18178 instruct vcvt8Bto8D_reg(vecZ dst, vecD src) %{
18179   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18180   match(Set dst (VectorCastB2X src));
18181   format %{ "vpmovsxbd   $dst,$src\n\t"
18182             "vcvtdq2pd   $dst,$dst\t! convert 8B to 8D vector" %}
18183   ins_encode %{
    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, 1); // 256-bit: 8B -> 8I in ymm
    __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, 2); // 512-bit: 8I -> 8D widens into zmm
18186   %}
18187   ins_pipe( pipe_slow );
18188 %}
18189 
18190 instruct vcvt4Sto4B_reg(vecS dst, vecD src, rRegL scratch) %{
18191   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18192   effect(TEMP scratch);
18193   match(Set dst (VectorCastS2X src));
18194   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18195             "vpackuswb  $dst,$dst\t! convert 4S to 4B vector" %}
18196   ins_encode %{
18197     int vector_len = 0;
18198     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18199     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18200   %}
18201   ins_pipe( pipe_slow );
18202 %}
18203 
18204 instruct vcvt8Sto8B_reg(vecD dst, vecX src, rRegL scratch) %{
18205   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18206   effect(TEMP scratch);
18207   match(Set dst (VectorCastS2X src));
18208   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18209             "vpackuswb  $dst,$dst\t! convert 8S to 8B vector" %}
18210   ins_encode %{
18211     int vector_len = 0;
18212     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18213     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18214   %}
18215   ins_pipe( pipe_slow );
18216 %}
18217 
18218 instruct vcvt16Sto16B_reg(vecX dst, vecY src, vecY tmp, rRegL scratch) %{
  predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18220   effect(TEMP scratch, TEMP tmp);
18221   match(Set dst (VectorCastS2X src));
18222     format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18223               "vextracti128 $tmp,$dst,0x1\n\t"
18224               "vpackuswb  $dst,$dst,$tmp\t! convert 16S to 16B vector" %}
18225   ins_encode %{
18226     int vector_len = 1;
18227     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18228     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18229     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18230   %}
18231   ins_pipe( pipe_slow );
18232 %}
18233 
18234 instruct vcvt32Sto32B_reg(vecY dst, vecZ src) %{
18235   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18236   match(Set dst (VectorCastS2X src));
18237     format %{ "evpmovwb   $dst,$src\t! convert 32S to 32B vector" %}
18238   ins_encode %{
18239     int vector_len = 2;
18240     __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18241   %}
18242   ins_pipe( pipe_slow );
18243 %}
18244 
18245 instruct vcvt2Sto2I_reg(vecD dst, vecS src) %{
18246   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18247   match(Set dst (VectorCastS2X src));
18248   format %{ "vpmovsxwd   $dst,$src\t! convert 2S to 2I vector" %}
18249   ins_encode %{
18250     int vector_len = 0;
18251     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18252   %}
18253   ins_pipe( pipe_slow );
18254 %}
18255 
18256 instruct vcvt4Sto4I_reg(vecX dst, vecD src) %{
18257   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18258   match(Set dst (VectorCastS2X src));
18259   format %{ "vpmovsxwd   $dst,$src\t! convert 4S to 4I vector" %}
18260   ins_encode %{
18261     int vector_len = 0;
18262     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18263   %}
18264   ins_pipe( pipe_slow );
18265 %}
18266 
18267 instruct vcvt8Sto8I_reg(vecY dst, vecX src) %{
18268   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18269   match(Set dst (VectorCastS2X src));
18270   format %{ "vpmovsxwd   $dst,$src\t! convert 8S to 8I vector" %}
18271   ins_encode %{
18272     int vector_len = 1;
18273     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18274   %}
18275   ins_pipe( pipe_slow );
18276 %}
18277 
18278 instruct vcvt16Sto16I_reg(vecZ dst, vecY src) %{
18279   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18280   match(Set dst (VectorCastS2X src));
18281   format %{ "vpmovsxwd   $dst,$src\t! convert 16S to 16I vector" %}
18282   ins_encode %{
18283     int vector_len = 2;
18284     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18285   %}
18286   ins_pipe( pipe_slow );
18287 %}
18288 
18289 instruct vcvt2Sto2L_reg(vecX dst, vecS src) %{
18290   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18291   match(Set dst (VectorCastS2X src));
18292   format %{ "vpmovsxwq   $dst,$src\t! convert 2S to 2L vector" %}
18293   ins_encode %{
18294     int vector_len = 0;
18295     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18296   %}
18297   ins_pipe( pipe_slow );
18298 %}
18299 
18300 instruct vcvt4Sto4L_reg(vecY dst, vecD src) %{
18301   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18302   match(Set dst (VectorCastS2X src));
18303   format %{ "vpmovsxwq   $dst,$src\t! convert 4S to 4L vector" %}
18304   ins_encode %{
18305     int vector_len = 1;
18306     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18307   %}
18308   ins_pipe( pipe_slow );
18309 %}
18310 
18311 instruct vcvt8Sto8L_reg(vecZ dst, vecX src) %{
18312   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18313   match(Set dst (VectorCastS2X src));
18314   format %{ "vpmovsxwq   $dst,$src\t! convert 8S to 8L vector" %}
18315   ins_encode %{
18316     int vector_len = 2;
18317     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18318   %}
18319   ins_pipe( pipe_slow );
18320 %}
18321 
18322 instruct vcvt2Sto2F_reg(vecD dst, vecS src) %{
18323   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18324   match(Set dst (VectorCastS2X src));
18325   format %{ "vpmovsxwd   $dst,$src\n\t"
18326             "vcvtdq2ps   $dst,$dst\t! convert 2S to 2F vector" %}
18327   ins_encode %{
18328     int vector_len = 0;
18329     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18330     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18331   %}
18332   ins_pipe( pipe_slow );
18333 %}
18334 
18335 instruct vcvt4Sto4F_reg(vecX dst, vecD src) %{
18336   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18337   match(Set dst (VectorCastS2X src));
18338   format %{ "vpmovsxwd   $dst,$src\n\t"
18339             "vcvtdq2ps   $dst,$dst\t! convert 4S to 4F vector" %}
18340   ins_encode %{
18341     int vector_len = 0;
18342     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18343     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18344   %}
18345   ins_pipe( pipe_slow );
18346 %}
18347 
18348 instruct vcvt8Sto8F_reg(vecY dst, vecX src) %{
18349   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18350   match(Set dst (VectorCastS2X src));
18351   format %{ "vpmovsxwd   $dst,$src\n\t"
18352             "vcvtdq2ps   $dst,$dst\t! convert 8S to 8F vector" %}
18353   ins_encode %{
18354     int vector_len = 1;
18355     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18356     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18357   %}
18358   ins_pipe( pipe_slow );
18359 %}
18360 
18361 instruct vcvt16Sto16F_reg(vecZ dst, vecY src) %{
18362   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18363   match(Set dst (VectorCastS2X src));
18364   format %{ "vpmovsxwd   $dst,$src\n\t"
18365             "vcvtdq2ps   $dst,$dst\t! convert 16S to 16F vector" %}
18366   ins_encode %{
18367     int vector_len = 2;
18368     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18369     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18370   %}
18371   ins_pipe( pipe_slow );
18372 %}
18373 
18374 instruct vcvt2Sto2D_reg(vecX dst, vecS src) %{
18375   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18376   match(Set dst (VectorCastS2X src));
18377   format %{ "vpmovsxwd   $dst,$src\n\t"
18378             "vcvtdq2pd   $dst,$dst\t! convert 2S to 2D vector" %}
18379   ins_encode %{
18380     int vector_len = 0;
18381     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18382     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18383   %}
18384   ins_pipe( pipe_slow );
18385 %}
18386 
18387 instruct vcvt4Sto4D_reg(vecY dst, vecD src) %{
18388   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18389   match(Set dst (VectorCastS2X src));
18390   format %{ "vpmovsxwd   $dst,$src\n\t"
18391             "vcvtdq2pd   $dst,$dst\t! convert 4S to 4D vector" %}
18392   ins_encode %{
18393     int vector_len = 1;
18394     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18395     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18396   %}
18397   ins_pipe( pipe_slow );
18398 %}
18399 
18400 instruct vcvt8Sto8D_reg(vecZ dst, vecX src) %{
18401   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18402   match(Set dst (VectorCastS2X src));
18403   format %{ "vpmovsxwd   $dst,$src\n\t"
18404             "vcvtdq2pd   $dst,$dst\t! convert 8S to 8D vector" %}
18405   ins_encode %{
18406     int vector_len = 2;
18407     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18408     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18409   %}
18410   ins_pipe( pipe_slow );
18411 %}
18412 
18413 instruct vcvt4Ito4B_reg(vecS dst, vecX src, rRegL scratch) %{
18414   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18415   effect(TEMP scratch);
18416   match(Set dst (VectorCastI2X src));
18417   format %{ "vpand      $dst,$src,[0x000000FF000000FF]\n\t"
18418             "vpackusdw  $dst,$dst\n\t"
18419             "vpackuswb  $dst,$dst\t! convert 4I to 4B vector" %}
18420   ins_encode %{
18421     int vector_len = 0;
18422     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18423     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18424     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18425   %}
18426   ins_pipe( pipe_slow );
18427 %}
18428 
18429 instruct vcvt8Ito8B_reg(vecD dst, vecY src, vecY tmp, rRegL scratch) %{
18430   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18431   effect(TEMP scratch, TEMP tmp);
18432   match(Set dst (VectorCastI2X src));
18433   format %{ "vpand      $dst,$src,[0x000000FF000000FF]\n\t"
18434             "vextracti128 $tmp,$dst,0x1\n\t"
18435             "vpackusdw  $dst,$dst,$tmp\n\t"
18436             "vpackuswb  $dst,$dst\t! convert 8I to 8B vector" %}
18437   ins_encode %{
18438     int vector_len = 1;
18439     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18440     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18441     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0); // 128-bit pack; only the low 8 bytes are significant
18443   %}
18444   ins_pipe( pipe_slow );
18445 %}
18446 
18447 instruct vcvt16Ito16B_reg(vecX dst, vecZ src) %{
18448   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18449   match(Set dst (VectorCastI2X src));
18450     format %{ "evpmovdb   $dst,$src\t! convert 16I to 16B vector" %}
18451   ins_encode %{
18452     int vector_len = 2;
18453     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18454   %}
18455   ins_pipe( pipe_slow );
18456 %}
18457 
18458 instruct vcvt2Ito2S_reg(vecS dst, vecD src, rRegL scratch) %{
18459   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18460   effect(TEMP scratch);
18461   match(Set dst (VectorCastI2X src));
18462   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18463             "vpackusdw  $dst,$dst\t! convert 2I to 2S vector" %}
18464   ins_encode %{
18465     int vector_len = 0;
18466     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18467     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18468   %}
18469   ins_pipe( pipe_slow );
18470 %}
18471 
18472 instruct vcvt4Ito4S_reg(vecD dst, vecX src, rRegL scratch) %{
18473   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18474   effect(TEMP scratch);
18475   match(Set dst (VectorCastI2X src));
18476   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18477             "vpackusdw  $dst,$dst\t! convert 4I to 4S vector" %}
18478   ins_encode %{
18479     int vector_len = 0;
18480     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18481     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18482   %}
18483   ins_pipe( pipe_slow );
18484 %}
18485 
18486 instruct vcvt8Ito8S_reg(vecX dst, vecY src, vecY tmp, rRegL scratch) %{
18487   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18488   effect(TEMP scratch, TEMP tmp);
18489   match(Set dst (VectorCastI2X src));
18490   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18491             "vextracti128 $tmp,$dst,0x1\n\t"
18492             "vpackusdw  $dst,$dst,$tmp\t! convert 8I to 8S vector" %}
18493   ins_encode %{
18494     int vector_len = 1;
18495     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18496     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18497     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18498   %}
18499   ins_pipe( pipe_slow );
18500 %}
18501 
18502 instruct vcvt16Ito16S_reg(vecY dst, vecZ src) %{
18503   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18504   match(Set dst (VectorCastI2X src));
18505     format %{ "evpmovdw   $dst,$src\t! convert 16I to 16S vector" %}
18506   ins_encode %{
18507     int vector_len = 2;
18508     __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18509   %}
18510   ins_pipe( pipe_slow );
18511 %}
18512 
18513 instruct vcvt2Ito2L_reg(vecX dst, vecD src) %{
18514   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18515   match(Set dst (VectorCastI2X src));
18516   format %{ "vpmovsxdq   $dst,$src\t! convert 2I to 2L vector" %}
18517   ins_encode %{
18518     int vector_len = 0;
18519     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18520   %}
18521   ins_pipe( pipe_slow );
18522 %}
18523 
18524 instruct vcvt4Ito4L_reg(vecY dst, vecX src) %{
18525   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18526   match(Set dst (VectorCastI2X src));
18527   format %{ "vpmovsxdq   $dst,$src\t! convert 4I to 4L vector" %}
18528   ins_encode %{
18529     int vector_len = 1;
18530     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18531   %}
18532   ins_pipe( pipe_slow );
18533 %}
18534 
18535 instruct vcvt8Ito8L_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18537   match(Set dst (VectorCastI2X src));
18538   format %{ "vpmovsxdq   $dst,$src\t! convert 8I to 8L vector" %}
18539   ins_encode %{
18540     int vector_len = 2;
18541     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18542   %}
18543   ins_pipe( pipe_slow );
18544 %}
18545 
18546 instruct vcvt2Ito2F_reg(vecD dst, vecD src) %{
18547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18548   match(Set dst (VectorCastI2X src));
18549   format %{ "vcvtdq2ps   $dst,$src\t! convert 2I to 2F vector" %}
18550   ins_encode %{
18551     int vector_len = 0;
18552     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18553   %}
18554   ins_pipe( pipe_slow );
18555 %}
18556 
18557 instruct vcvt4Ito4F_reg(vecX dst, vecX src) %{
18558   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18559   match(Set dst (VectorCastI2X src));
18560   format %{ "vcvtdq2ps   $dst,$src\t! convert 4I to 4F vector" %}
18561   ins_encode %{
18562     int vector_len = 0;
18563     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18564   %}
18565   ins_pipe( pipe_slow );
18566 %}
18567 
18568 instruct vcvt8Ito8F_reg(vecY dst, vecY src) %{
18569   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18570   match(Set dst (VectorCastI2X src));
18571   format %{ "vcvtdq2ps   $dst,$src\t! convert 8I to 8F vector" %}
18572   ins_encode %{
18573     int vector_len = 1;
18574     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18575   %}
18576   ins_pipe( pipe_slow );
18577 %}
18578 
18579 instruct vcvt16Ito16F_reg(vecZ dst, vecZ src) %{
18580   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18581   match(Set dst (VectorCastI2X src));
18582   format %{ "vcvtdq2ps   $dst,$src\t! convert 16I to 16F vector" %}
18583   ins_encode %{
18584     int vector_len = 2;
18585     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18586   %}
18587   ins_pipe( pipe_slow );
18588 %}
18589 
18590 instruct vcvt2Ito2D_reg(vecX dst, vecD src) %{
18591   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18592   match(Set dst (VectorCastI2X src));
18593   format %{ "vcvtdq2pd   $dst,$src\t! convert 2I to 2D vector" %}
18594   ins_encode %{
18595     int vector_len = 0;
18596     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18597   %}
18598   ins_pipe( pipe_slow );
18599 %}
18600 
18601 instruct vcvt4Ito4D_reg(vecY dst, vecX src) %{
18602   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18603   match(Set dst (VectorCastI2X src));
18604   format %{ "vcvtdq2pd   $dst,$src\t! convert 4I to 4D vector" %}
18605   ins_encode %{
18606     int vector_len = 1;
18607     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18608   %}
18609   ins_pipe( pipe_slow );
18610 %}
18611 
18612 instruct vcvt8Ito8D_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18614   match(Set dst (VectorCastI2X src));
18615   format %{ "vcvtdq2pd   $dst,$src\t! convert 8I to 8D vector" %}
18616   ins_encode %{
18617     int vector_len = 2;
18618     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18619   %}
18620   ins_pipe( pipe_slow );
18621 %}
18622 
18623 instruct vcvt4Lto4B_reg(vecS dst, vecY src, rRegL scratch) %{
18624   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18625   match(Set dst (VectorCastL2X src));
18626   effect(TEMP scratch);
18627   format %{ "vpermilps  $dst,$src,8\n\t"
18628             "vpermpd    $dst,$dst,8\n\t"
18629             "vpand      $dst,$dst,[0x000000FF000000FF]\n\t"
18630             "vpackusdw  $dst,$dst\n\t"
18631             "vpackuswb  $dst,$dst\t! convert 4L to 4B vector" %}
18632   ins_encode %{
18633     int vector_len = 1;
18634     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18635     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
    // The elements have been narrowed to ints, so do the remaining packing at 128-bit width.
18637     vector_len = 0;
18638     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18639     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18640     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18641   %}
18642   ins_pipe( pipe_slow );
18643 %}
18644 
18645 instruct vcvt8Lto8B_reg(vecD dst, vecZ src) %{
18646   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18647   match(Set dst (VectorCastL2X src));
18648     format %{ "evpmovqb   $dst,$src\t! convert 8L to 8B vector" %}
18649   ins_encode %{
18650     int vector_len = 2;
18651     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18652   %}
18653   ins_pipe( pipe_slow );
18654 %}
18655 
18656 instruct vcvt2Lto2S_reg(vecS dst, vecX src, rRegL scratch) %{
18657   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18658   match(Set dst (VectorCastL2X src));
18659   effect(TEMP scratch);
18660   format %{ "vpshufd    $dst,$src,8\n\t"
18661             "vpand      $dst,$dst,[0x0000FFFF0000FFFF]\n\t"
18662             "vpackusdw  $dst,$dst\t! convert 2L to 2S vector" %}
18663   ins_encode %{
18664     int vector_len = 0;
18665     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18666     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18667     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18668   %}
18669   ins_pipe( pipe_slow );
18670 %}
18671 
18672 instruct vcvt4Lto4S_reg(vecD dst, vecY src, rRegL scratch) %{
18673   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18674   match(Set dst (VectorCastL2X src));
18675   effect(TEMP scratch);
18676   format %{ "vpermilps  $dst,$src,8\n\t"
18677             "vpermpd    $dst,$dst,8\n\t"
18678             "vpand      $dst,$dst,[0x0000FFFF0000FFFF]\n\t"
18679             "vpackusdw  $dst,$dst\t! convert 4L to 4S vector" %}
18680   ins_encode %{
18681     int vector_len = 1;
18682     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18683     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
    // The elements have been narrowed to ints, so do the remaining packing at 128-bit width.
18685     vector_len = 0;
18686     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18687     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18688   %}
18689   ins_pipe( pipe_slow );
18690 %}
18691 
18692 instruct vcvt8Lto8S_reg(vecX dst, vecZ src) %{
18693   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18694   match(Set dst (VectorCastL2X src));
18695     format %{ "evpmovqw   $dst,$src\t! convert 8L to 8S vector" %}
18696   ins_encode %{
18697     int vector_len = 2;
18698     __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18699   %}
18700   ins_pipe( pipe_slow );
18701 %}
18702 
18703 instruct vcvt1Lto1I_reg(vecS dst, vecD src) %{
18704   predicate(n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18705   match(Set dst (VectorCastL2X src));
18706   format %{ "movdqu   $dst,$src\t! convert 1L to 1I vector" %}
18707   ins_encode %{
18708     // If register is the same, then move is not needed.
18709     if ($dst$$XMMRegister != $src$$XMMRegister) {
18710       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
18711     }
18712   %}
18713   ins_pipe( pipe_slow );
18714 %}
18715 
18716 instruct vcvt2Lto2I_reg(vecD dst, vecX src) %{
18717   predicate(UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18718   match(Set dst (VectorCastL2X src));
18719   format %{ "pshufd   $dst,$src,8\t! convert 2L to 2I vector" %}
18720   ins_encode %{
18721     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
18722   %}
18723   ins_pipe( pipe_slow );
18724 %}
18725 
18726 instruct vcvt2Lto2I_reg_avx(vecD dst, vecX src) %{
18727   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18728   match(Set dst (VectorCastL2X src));
18729   format %{ "vpshufd   $dst,$src,8\t! convert 2L to 2I vector" %}
18730   ins_encode %{
18731     int vector_len = 0;
18732     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18733   %}
18734   ins_pipe( pipe_slow );
18735 %}
18736 
18737 instruct vcvt4Lto4I_reg(vecX dst, vecY src) %{
18738   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18739   match(Set dst (VectorCastL2X src));
18740   format %{ "vpermilps  $dst,$src,8\n\t"
18741           "vpermpd  $dst,$dst,8\t! convert 4L to 4I vector" %}
18742   ins_encode %{
18743     int vector_len = 1;
18744     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18745     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
18746   %}
18747   ins_pipe( pipe_slow );
18748 %}
18749 
18750 instruct vcvt8Lto8I_reg(vecY dst, vecZ src) %{
18751   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18752   match(Set dst (VectorCastL2X src));
18753     format %{ "evpmovqd   $dst,$src\t! convert 8L to 8I vector" %}
18754   ins_encode %{
18755     int vector_len = 2;
18756     __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18757   %}
18758   ins_pipe( pipe_slow );
18759 %}
18760 
18761 instruct vcvt2Lto2F_reg(vecD dst, vecX src) %{
18762   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18763   match(Set dst (VectorCastL2X src));
18764   format %{ "vcvtqq2ps   $dst,$src\t! convert 2L to 2F vector" %}
18765   ins_encode %{
18766     int vector_len = 0;
18767     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18768   %}
18769   ins_pipe( pipe_slow );
18770 %}
18771 
18772 instruct vcvt4Lto4F_reg(vecX dst, vecY src) %{
18773   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18774   match(Set dst (VectorCastL2X src));
18775   format %{ "vcvtqq2ps   $dst,$src\t! convert 4L to 4F vector" %}
18776   ins_encode %{
18777     int vector_len = 1;
18778     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18779   %}
18780   ins_pipe( pipe_slow );
18781 %}
18782 
18783 instruct vcvt8Lto8F_reg(vecY dst, vecZ src) %{
18784   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18785   match(Set dst (VectorCastL2X src));
18786   format %{ "vcvtqq2ps   $dst,$src\t! convert 8L to 8F vector" %}
18787   ins_encode %{
18788     int vector_len = 2;
18789     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18790   %}
18791   ins_pipe( pipe_slow );
18792 %}
18793 
18794 instruct vcvt1Lto1D_reg(vecD dst, vecD src) %{
18795   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18796   match(Set dst (VectorCastL2X src));
18797   format %{ "vcvtqq2pd   $dst,$src\t! convert 1L to 1D vector" %}
18798   ins_encode %{
18799     int vector_len = 0;
18800     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18801   %}
18802   ins_pipe( pipe_slow );
18803 %}
18804 
18805 instruct vcvt2Lto2D_reg(vecX dst, vecX src) %{
18806   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18807   match(Set dst (VectorCastL2X src));
18808   format %{ "vcvtqq2pd   $dst,$src\t! convert 2L to 2D vector" %}
18809   ins_encode %{
18810     int vector_len = 0;
18811     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18812   %}
18813   ins_pipe( pipe_slow );
18814 %}
18815 
18816 instruct vcvt4Lto4D_reg(vecY dst, vecY src) %{
18817   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18818   match(Set dst (VectorCastL2X src));
18819   format %{ "vcvtqq2pd   $dst,$src\t! convert 4L to 4D vector" %}
18820   ins_encode %{
18821     int vector_len = 1;
18822     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18823   %}
18824   ins_pipe( pipe_slow );
18825 %}
18826 
18827 instruct vcvt8Lto8D_reg(vecZ dst, vecZ src) %{
18828   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18829   match(Set dst (VectorCastL2X src));
18830   format %{ "vcvtqq2pd   $dst,$src\t! convert 8L to 8D vector" %}
18831   ins_encode %{
18832     int vector_len = 2;
18833     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18834   %}
18835   ins_pipe( pipe_slow );
18836 %}
18837 
18838 instruct vcvt2Fto2D_reg(vecX dst, vecD src) %{
18839   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18840   match(Set dst (VectorCastF2X src));
18841   format %{ "vcvtps2pd   $dst,$src\t! convert 2F to 2D vector" %}
18842   ins_encode %{
18843     int vector_len = 0;
18844     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18845   %}
18846   ins_pipe( pipe_slow );
18847 %}
18848 
18849 instruct vcvt4Fto4D_reg(vecY dst, vecX src) %{
18850   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18851   match(Set dst (VectorCastF2X src));
18852   format %{ "vcvtps2pd   $dst,$src\t! convert 4F to 4D vector" %}
18853   ins_encode %{
18854     int vector_len = 1;
18855     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18856   %}
18857   ins_pipe( pipe_slow );
18858 %}
18859 
18860 instruct vcvt8Fto8D_reg(vecZ dst, vecY src) %{
18861   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18862   match(Set dst (VectorCastF2X src));
18863   format %{ "vcvtps2pd   $dst,$src\t! convert 8F to 8D vector" %}
18864   ins_encode %{
18865     int vector_len = 2;
18866     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18867   %}
18868   ins_pipe( pipe_slow );
18869 %}
18870 
18871 instruct vcvt2Dto2F_reg(vecD dst, vecX src) %{
18872   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18873   match(Set dst (VectorCastD2X src));
18874   format %{ "vcvtpd2ps   $dst,$src\t! convert 2D to 2F vector" %}
18875   ins_encode %{
18876     int vector_len = 0;
18877     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18878   %}
18879   ins_pipe( pipe_slow );
18880 %}
18881 
18882 instruct vcvt4Dto4F_reg(vecX dst, vecY src) %{
18883   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18884   match(Set dst (VectorCastD2X src));
18885   format %{ "vcvtpd2ps   $dst,$src\t! convert 4D to 4F vector" %}
18886   ins_encode %{
18887     int vector_len = 1;
18888     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18889   %}
18890   ins_pipe( pipe_slow );
18891 %}
18892 
18893 instruct vcvt8Dto8F_reg(vecY dst, vecZ src) %{
18894   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18895   match(Set dst (VectorCastD2X src));
18896   format %{ "vcvtpd2ps   $dst,$src\t! convert 8D to 8F vector" %}
18897   ins_encode %{
18898     int vector_len = 2;
18899     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18900   %}
18901   ins_pipe( pipe_slow );
18902 %}
18903 
18904 instruct vcmpeq2F(vecD dst, vecD src1, vecD src2) %{
18905   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
18906             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18907             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18908   match(Set dst (VectorMaskCmp src1 src2));
18909   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed2F" %}
18910   ins_encode %{
18911     int vector_len = 0;
18912     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18913     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18914   %}
18915   ins_pipe( pipe_slow );
18916 %}
18917 
18918 instruct vcmpeq4F(vecX dst, vecX src1, vecX src2) %{
18919   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
18920             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18921             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18922   match(Set dst (VectorMaskCmp src1 src2));
18923   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed4F" %}
18924   ins_encode %{
18925     int vector_len = 0;
18926     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18927     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18928   %}
18929   ins_pipe( pipe_slow );
18930 %}
18931 
18932 instruct vcmpeq8F(vecY dst, vecY src1, vecY src2) %{
18933   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
18934             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18935             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18936   match(Set dst (VectorMaskCmp src1 src2));
18937   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed8F" %}
18938   ins_encode %{
18939     int vector_len = 1;
18940     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18941     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18942   %}
18943   ins_pipe( pipe_slow );
18944 %}
18945 
18946 instruct vcmpeq16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
18947   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
18948             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18949             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18950   match(Set dst (VectorMaskCmp src1 src2));
18951   effect(TEMP dst, TEMP scratch);
18952   format %{ "vcmpeqps  k2,$src1,$src2\n\t"
18953             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16F" %}
18954   ins_encode %{
18955     int vector_len = 2;
18956     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18957     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
18958     KRegister mask = k0; // The comparison itself is not being masked.
18959     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18960     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
18961   %}
18962   ins_pipe( pipe_slow );
18963 %}
18964 
18965 instruct vcmplt2F(vecD dst, vecD src1, vecD src2) %{
18966   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
18967             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
18968             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18969   match(Set dst (VectorMaskCmp src1 src2));
18970   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed2F" %}
18971   ins_encode %{
18972     int vector_len = 0;
18973     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
18974     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18975   %}
18976   ins_pipe( pipe_slow );
18977 %}
18978 
18979 instruct vcmplt4F(vecX dst, vecX src1, vecX src2) %{
18980   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
18981             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
18982             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18983   match(Set dst (VectorMaskCmp src1 src2));
18984   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed4F" %}
18985   ins_encode %{
18986     int vector_len = 0;
18987     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
18988     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18989   %}
18990   ins_pipe( pipe_slow );
18991 %}
18992 
18993 instruct vcmplt8F(vecY dst, vecY src1, vecY src2) %{
18994   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
18995             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
18996             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18997   match(Set dst (VectorMaskCmp src1 src2));
18998   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed8F" %}
18999   ins_encode %{
19000     int vector_len = 1;
19001     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19002     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19003   %}
19004   ins_pipe( pipe_slow );
19005 %}
19006 
19007 instruct vcmplt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19008   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19009             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19010             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19011   match(Set dst (VectorMaskCmp src1 src2));
19012   effect(TEMP dst, TEMP scratch);
19013   format %{ "vcmpltps  k2,$src1,$src2\n\t"
19014             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed16F" %}
19015   ins_encode %{
19016     int vector_len = 2;
19017     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19018     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19019     KRegister mask = k0; // The comparison itself is not being masked.
19020     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19021     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19022   %}
19023   ins_pipe( pipe_slow );
19024 %}
19025 
19026 instruct vcmpgt2F(vecD dst, vecD src1, vecD src2) %{
19027   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19028             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19029             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19030   match(Set dst (VectorMaskCmp src1 src2));
19031   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed2F" %}
19032   ins_encode %{
19033     int vector_len = 0;
19034     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19035     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19036   %}
19037   ins_pipe( pipe_slow );
19038 %}
19039 
19040 instruct vcmpgt4F(vecX dst, vecX src1, vecX src2) %{
19041   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19042             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19043             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19044   match(Set dst (VectorMaskCmp src1 src2));
19045   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed4F" %}
19046   ins_encode %{
19047     int vector_len = 0;
19048     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19049     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19050   %}
19051   ins_pipe( pipe_slow );
19052 %}
19053 
19054 instruct vcmpgt8F(vecY dst, vecY src1, vecY src2) %{
19055   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19056             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19057             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19058   match(Set dst (VectorMaskCmp src1 src2));
19059   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed8F" %}
19060   ins_encode %{
19061     int vector_len = 1;
19062     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19063     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19064   %}
19065   ins_pipe( pipe_slow );
19066 %}
19067 
19068 instruct vcmpgt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19069   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19070             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19071             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19072   match(Set dst (VectorMaskCmp src1 src2));
19073   effect(TEMP dst, TEMP scratch);
19074   format %{ "vcmpgtps  k2,$src1,$src2\n\t"
19075             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16F" %}
19076   ins_encode %{
19077     int vector_len = 2;
19078     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19079     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19080     KRegister mask = k0; // The comparison itself is not being masked.
19081     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19082     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19083   %}
19084   ins_pipe( pipe_slow );
19085 %}
19086 
19087 instruct vcmpge2F(vecD dst, vecD src1, vecD src2) %{
19088   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19089             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19090             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19091   match(Set dst (VectorMaskCmp src1 src2));
19092   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed2F" %}
19093   ins_encode %{
19094     int vector_len = 0;
19095     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19096     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19097   %}
19098   ins_pipe( pipe_slow );
19099 %}
19100 
19101 instruct vcmpge4F(vecX dst, vecX src1, vecX src2) %{
19102   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19103             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19104             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19105   match(Set dst (VectorMaskCmp src1 src2));
19106   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed4F" %}
19107   ins_encode %{
19108     int vector_len = 0;
19109     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19110     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19111   %}
19112   ins_pipe( pipe_slow );
19113 %}
19114 
19115 instruct vcmpge8F(vecY dst, vecY src1, vecY src2) %{
19116   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19117             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19118             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19119   match(Set dst (VectorMaskCmp src1 src2));
19120   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed8F" %}
19121   ins_encode %{
19122     int vector_len = 1;
19123     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19124     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19125   %}
19126   ins_pipe( pipe_slow );
19127 %}
19128 
19129 instruct vcmpge16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19130   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19131             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19132             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19133   match(Set dst (VectorMaskCmp src1 src2));
19134   effect(TEMP dst, TEMP scratch);
19135   format %{ "vcmpgeps  k2,$src1,$src2\n\t"
19136             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16F" %}
19137   ins_encode %{
19138     int vector_len = 2;
19139     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19140     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19141     KRegister mask = k0; // The comparison itself is not being masked.
19142     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19143     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19144   %}
19145   ins_pipe( pipe_slow );
19146 %}
19147 
19148 instruct vcmple2F(vecD dst, vecD src1, vecD src2) %{
19149   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19150             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19151             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19152   match(Set dst (VectorMaskCmp src1 src2));
19153   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed2F" %}
19154   ins_encode %{
19155     int vector_len = 0;
19156     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19157     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19158   %}
19159   ins_pipe( pipe_slow );
19160 %}
19161 
19162 instruct vcmple4F(vecX dst, vecX src1, vecX src2) %{
19163   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19164             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19165             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19166   match(Set dst (VectorMaskCmp src1 src2));
19167   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed4F" %}
19168   ins_encode %{
19169     int vector_len = 0;
19170     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19171     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19172   %}
19173   ins_pipe( pipe_slow );
19174 %}
19175 
19176 instruct vcmple8F(vecY dst, vecY src1, vecY src2) %{
19177   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19178             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19179             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19180   match(Set dst (VectorMaskCmp src1 src2));
19181   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed8F" %}
19182   ins_encode %{
19183     int vector_len = 1;
19184     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19185     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19186   %}
19187   ins_pipe( pipe_slow );
19188 %}
19189 
19190 instruct vcmple16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19191   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19192             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19193             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19194   match(Set dst (VectorMaskCmp src1 src2));
19195   effect(TEMP dst, TEMP scratch);
19196   format %{ "vcmpleps  k2,$src1,$src2\n\t"
19197             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16F" %}
19198   ins_encode %{
19199     int vector_len = 2;
19200     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19201     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19202     KRegister mask = k0; // The comparison itself is not being masked.
19203     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19204     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19205   %}
19206   ins_pipe( pipe_slow );
19207 %}
19208 
19209 instruct vcmpne2F(vecD dst, vecD src1, vecD src2) %{
19210   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19211             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19212             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19213   match(Set dst (VectorMaskCmp src1 src2));
19214   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed2F" %}
19215   ins_encode %{
19216     int vector_len = 0;
19217     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19218     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19219     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19220   %}
19221   ins_pipe( pipe_slow );
19222 %}
19223 
19224 instruct vcmpne4F(vecX dst, vecX src1, vecX src2) %{
19225   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19226             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19227             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19228   match(Set dst (VectorMaskCmp src1 src2));
19229   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed4F" %}
19230   ins_encode %{
19231     int vector_len = 0;
19232     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19233     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19234     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19235   %}
19236   ins_pipe( pipe_slow );
19237 %}
19238 
19239 instruct vcmpne8F(vecY dst, vecY src1, vecY src2) %{
19240   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19241             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19242             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19243   match(Set dst (VectorMaskCmp src1 src2));
19244   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed8F" %}
19245   ins_encode %{
19246     int vector_len = 1;
19247     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19248     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19249     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19250   %}
19251   ins_pipe( pipe_slow );
19252 %}
19253 
19254 instruct vcmpne16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19255   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19256             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19257             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19258   match(Set dst (VectorMaskCmp src1 src2));
19259   effect(TEMP dst, TEMP scratch);
19260   format %{ "vcmpneps  k2,$src1,$src2\n\t"
19261             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed16F" %}
19262   ins_encode %{
19263     int vector_len = 2;
19264     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19265     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19266     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19267     KRegister mask = k0; // The comparison itself is not being masked.
19268     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19269     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19270   %}
19271   ins_pipe( pipe_slow );
19272 %}
19273 
19274 instruct vcmpeq1D(vecD dst, vecD src1, vecD src2) %{
19275   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19276             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19277             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19278   match(Set dst (VectorMaskCmp src1 src2));
19279   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed1D" %}
19280   ins_encode %{
19281     int vector_len = 0;
19282     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19283     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19284   %}
19285   ins_pipe( pipe_slow );
19286 %}
19287 
19288 instruct vcmpeq2D(vecX dst, vecX src1, vecX src2) %{
19289   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19290             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19291             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19292   match(Set dst (VectorMaskCmp src1 src2));
19293   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed2D" %}
19294   ins_encode %{
19295     int vector_len = 0;
19296     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19297     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19298   %}
19299   ins_pipe( pipe_slow );
19300 %}
19301 
19302 instruct vcmpeq4D(vecY dst, vecY src1, vecY src2) %{
19303   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19304             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19305             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19306   match(Set dst (VectorMaskCmp src1 src2));
19307   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed4D" %}
19308   ins_encode %{
19309     int vector_len = 1;
19310     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19311     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19312   %}
19313   ins_pipe( pipe_slow );
19314 %}
19315 
19316 instruct vcmpeq8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19317   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19318             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19319             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19320   match(Set dst (VectorMaskCmp src1 src2));
19321   effect(TEMP dst, TEMP scratch);
19322   format %{ "vcmpeqpd  k2,$src1,$src2\n\t"
19323             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8D" %}
19324   ins_encode %{
19325     int vector_len = 2;
19326     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19327     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19328     KRegister mask = k0; // The comparison itself is not being masked.
19329     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19330     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19331   %}
19332   ins_pipe( pipe_slow );
19333 %}
19334 
19335 instruct vcmplt1D(vecD dst, vecD src1, vecD src2) %{
19336   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19337             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19338             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19339   match(Set dst (VectorMaskCmp src1 src2));
19340   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed1D" %}
19341   ins_encode %{
19342     int vector_len = 0;
19343     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19344     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19345   %}
19346   ins_pipe( pipe_slow );
19347 %}
19348 
19349 instruct vcmplt2D(vecX dst, vecX src1, vecX src2) %{
19350   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19351             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19352             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19353   match(Set dst (VectorMaskCmp src1 src2));
19354   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed2D" %}
19355   ins_encode %{
19356     int vector_len = 0;
19357     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19358     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19359   %}
19360   ins_pipe( pipe_slow );
19361 %}
19362 
19363 instruct vcmplt4D(vecY dst, vecY src1, vecY src2) %{
19364   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19365             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19366             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19367   match(Set dst (VectorMaskCmp src1 src2));
19368   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed4D" %}
19369   ins_encode %{
19370     int vector_len = 1;
19371     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19372     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19373   %}
19374   ins_pipe( pipe_slow );
19375 %}
19376 
19377 instruct vcmplt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19378   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19379             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19380             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19381   match(Set dst (VectorMaskCmp src1 src2));
19382   effect(TEMP dst, TEMP scratch);
19383   format %{ "vcmpltpd  k2,$src1,$src2\n\t"
19384             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8D" %}
19385   ins_encode %{
19386     int vector_len = 2;
19387     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19388     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19389     KRegister mask = k0; // The comparison itself is not being masked.
19390     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19391     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19392   %}
19393   ins_pipe( pipe_slow );
19394 %}
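
// LT_OQ / GT_OQ / GE_OQ / LE_OQ are the ordered predicates: a lane where either
// operand is NaN compares false, exactly as Java's relational operators on
// doubles behave. Illustrative scalar equivalent (assumed helper name):
//
//   static void cmpltMask(double[] a, double[] b, long[] mask) {
//     for (int i = 0; i < a.length; i++) {
//       mask[i] = (a[i] < b[i]) ? -1L : 0L;  // NaN lanes stay 0 (false)
//     }
//   }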
19395 
19396 instruct vcmpgt1D(vecD dst, vecD src1, vecD src2) %{
19397   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19398             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19399             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19400   match(Set dst (VectorMaskCmp src1 src2));
19401   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed1D" %}
19402   ins_encode %{
19403     int vector_len = 0;
19404     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19405     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19406   %}
19407   ins_pipe( pipe_slow );
19408 %}
19409 
19410 instruct vcmpgt2D(vecX dst, vecX src1, vecX src2) %{
19411   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19412             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19413             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19414   match(Set dst (VectorMaskCmp src1 src2));
19415   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed2D" %}
19416   ins_encode %{
19417     int vector_len = 0;
19418     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19419     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19420   %}
19421   ins_pipe( pipe_slow );
19422 %}
19423 
19424 instruct vcmpgt4D(vecY dst, vecY src1, vecY src2) %{
19425   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19426             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19427             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19428   match(Set dst (VectorMaskCmp src1 src2));
19429   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed4D" %}
19430   ins_encode %{
19431     int vector_len = 1;
19432     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19433     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19434   %}
19435   ins_pipe( pipe_slow );
19436 %}
19437 
19438 instruct vcmpgt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19439   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19440             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19441             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19442   match(Set dst (VectorMaskCmp src1 src2));
19443   effect(TEMP dst, TEMP scratch);
19444   format %{ "vcmpgtpd  k2,$src1,$src2\n\t"
19445             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed8D" %}
19446   ins_encode %{
19447     int vector_len = 2;
19448     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19449     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19450     KRegister mask = k0; // The comparison itself is not being masked.
19451     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19452     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19453   %}
19454   ins_pipe( pipe_slow );
19455 %}
19456 
19457 instruct vcmpge1D(vecD dst, vecD src1, vecD src2) %{
19458   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19459             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19460             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19461   match(Set dst (VectorMaskCmp src1 src2));
19462   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed1D" %}
19463   ins_encode %{
19464     int vector_len = 0;
19465     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19466     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19467   %}
19468   ins_pipe( pipe_slow );
19469 %}
19470 
19471 instruct vcmpge2D(vecX dst, vecX src1, vecX src2) %{
19472   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19473             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19474             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19475   match(Set dst (VectorMaskCmp src1 src2));
19476   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed2D" %}
19477   ins_encode %{
19478     int vector_len = 0;
19479     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19480     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19481   %}
19482   ins_pipe( pipe_slow );
19483 %}
19484 
19485 instruct vcmpge4D(vecY dst, vecY src1, vecY src2) %{
19486   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19487             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19488             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19489   match(Set dst (VectorMaskCmp src1 src2));
19490   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed4D" %}
19491   ins_encode %{
19492     int vector_len = 1;
19493     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19494     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19495   %}
19496   ins_pipe( pipe_slow );
19497 %}
19498 
19499 instruct vcmpge8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19500   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19501             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19502             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19503   match(Set dst (VectorMaskCmp src1 src2));
19504   effect(TEMP dst, TEMP scratch);
19505   format %{ "vcmpgepd  k2,$src1,$src2\n\t"
19506             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed8D" %}
19507   ins_encode %{
19508     int vector_len = 2;
19509     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19510     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19511     KRegister mask = k0; // The comparison itself is not being masked.
19512     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19513     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19514   %}
19515   ins_pipe( pipe_slow );
19516 %}
19517 
19518 instruct vcmple1D(vecD dst, vecD src1, vecD src2) %{
19519   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19520             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19521             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19522   match(Set dst (VectorMaskCmp src1 src2));
19523   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed1D" %}
19524   ins_encode %{
19525     int vector_len = 0;
19526     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19527     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19528   %}
19529   ins_pipe( pipe_slow );
19530 %}
19531 
19532 instruct vcmple2D(vecX dst, vecX src1, vecX src2) %{
19533   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19534             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19535             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19536   match(Set dst (VectorMaskCmp src1 src2));
19537   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed2D" %}
19538   ins_encode %{
19539     int vector_len = 0;
19540     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19541     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19542   %}
19543   ins_pipe( pipe_slow );
19544 %}
19545 
19546 instruct vcmple4D(vecY dst, vecY src1, vecY src2) %{
19547   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19548             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19549             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19550   match(Set dst (VectorMaskCmp src1 src2));
19551   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed4D" %}
19552   ins_encode %{
19553     int vector_len = 1;
19554     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19555     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19556   %}
19557   ins_pipe( pipe_slow );
19558 %}
19559 
19560 instruct vcmple8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19561   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19562             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19563             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19564   match(Set dst (VectorMaskCmp src1 src2));
19565   effect(TEMP dst, TEMP scratch);
19566   format %{ "vcmplepd  k2,$src1,$src2\n\t"
19567             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed8D" %}
19568   ins_encode %{
19569     int vector_len = 2;
19570     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19571     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19572     KRegister mask = k0; // The comparison itself is not being masked.
19573     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19574     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19575   %}
19576   ins_pipe( pipe_slow );
19577 %}
19578 
19579 instruct vcmpne1D(vecD dst, vecD src1, vecD src2) %{
19580   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19581             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19582             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19583   match(Set dst (VectorMaskCmp src1 src2));
19584   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed1D" %}
19585   ins_encode %{
19586     int vector_len = 0;
19587     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19588     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19589     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19590   %}
19591   ins_pipe( pipe_slow );
19592 %}
19593 
19594 instruct vcmpne2D(vecX dst, vecX src1, vecX src2) %{
19595   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19596             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19597             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19598   match(Set dst (VectorMaskCmp src1 src2));
19599   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed2D" %}
19600   ins_encode %{
19601     int vector_len = 0;
19602     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19603     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19604     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19605   %}
19606   ins_pipe( pipe_slow );
19607 %}
19608 
19609 instruct vcmpne4D(vecY dst, vecY src1, vecY src2) %{
19610   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19611             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19612             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19613   match(Set dst (VectorMaskCmp src1 src2));
19614   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed4D" %}
19615   ins_encode %{
19616     int vector_len = 1;
19617     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19618     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19619     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19620   %}
19621   ins_pipe( pipe_slow );
19622 %}
19623 
19624 instruct vcmpne8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19625   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19626             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19627             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19628   match(Set dst (VectorMaskCmp src1 src2));
19629   effect(TEMP dst, TEMP scratch);
19630   format %{ "vcmpnepd  k2,$src1,$src2\n\t"
19631             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed8D" %}
19632   ins_encode %{
19633     int vector_len = 2;
19634     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19635     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19636     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19637     KRegister mask = k0; // The comparison itself is not being masked.
19638     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19639     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19640   %}
19641   ins_pipe( pipe_slow );
19642 %}
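
// Note the asymmetry across the FP families: eq/lt/gt/ge/le use ordered (_OQ)
// predicates, where a NaN lane compares false, while ne uses unordered NEQ_UQ,
// where a NaN lane compares true. Both follow from Java semantics; a small
// illustrative check (not part of this file):
//
//   static void nanSemantics(double x) {
//     double nan = Double.NaN;
//     assert !(nan == x) && !(nan < x) && !(nan > x);  // ordered: all false
//     assert nan != x;                                 // unordered: true
//   }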
19643 
19644 instruct vcmpeq2I(vecD dst, vecD src1, vecD src2) %{
19645   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19646             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19647             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19648   match(Set dst (VectorMaskCmp src1 src2));
19649   format %{ "vpcmpeqd  $dst,$src1,$src2\t! cmpeq packed2I" %}
19650   ins_encode %{
19651     int vector_len = 0;
19652     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19653   %}
19654   ins_pipe( pipe_slow );
19655 %}
19656 
19657 instruct vcmpeq4I(vecX dst, vecX src1, vecX src2) %{
19658   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19659             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19660             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19661   match(Set dst (VectorMaskCmp src1 src2));
19662   format %{ "vpcmpeqd  $dst,$src1,$src2\t! cmpeq packed4I" %}
19663   ins_encode %{
19664     int vector_len = 0;
19665     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19666   %}
19667   ins_pipe( pipe_slow );
19668 %}
19669 
19670 instruct vcmpeq8I(vecY dst, vecY src1, vecY src2) %{
19671   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19672             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19673             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19674   match(Set dst (VectorMaskCmp src1 src2));
19675   format %{ "vpcmpeqd  $dst,$src1,$src2\t! cmpeq packed8I" %}
19676   ins_encode %{
19677     int vector_len = 1;
19678     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19679   %}
19680   ins_pipe( pipe_slow );
19681 %}
19682 
19683 instruct vcmpeq16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19684   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19685             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19686             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19687   match(Set dst (VectorMaskCmp src1 src2));
19688   effect(TEMP dst, TEMP scratch);
19689   format %{ "vpcmpeqd  k2,$src1,$src2\n\t"
19690             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16I" %}
19691   ins_encode %{
19692     int vector_len = 2;
19693     Assembler::ComparisonPredicate cmp = Assembler::eq;
19694     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19695     KRegister mask = k0; // The comparison itself is not being masked.
19696     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19697     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19698   %}
19699   ins_pipe( pipe_slow );
19700 %}
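
// Integer equality needs no predicate constant: vpcmpeqd writes the
// all-ones/all-zeros lane mask directly into a vector register, with only the
// 512-bit form above taking the k-register detour. Scalar sketch of the mask
// produced (hypothetical helper):
//
//   static void cmpeqMask(int[] a, int[] b, int[] mask) {
//     for (int i = 0; i < a.length; i++) {
//       mask[i] = (a[i] == b[i]) ? -1 : 0;
//     }
//   }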
19701 
19702 instruct vcmplt2I(vecD dst, vecD src1, vecD src2) %{
19703   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19704             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19705             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19706   match(Set dst (VectorMaskCmp src1 src2));
19707   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed2I" %}
19708   ins_encode %{
19709     int vector_len = 0;
19710     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19711   %}
19712   ins_pipe( pipe_slow );
19713 %}
19714 
19715 instruct vcmplt4I(vecX dst, vecX src1, vecX src2) %{
19716   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19717             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19718             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19719   match(Set dst (VectorMaskCmp src1 src2));
19720   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed4I" %}
19721   ins_encode %{
19722     int vector_len = 0;
19723     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19724   %}
19725   ins_pipe( pipe_slow );
19726 %}
19727 
19728 instruct vcmplt8I(vecY dst, vecY src1, vecY src2) %{
19729   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19730             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19731             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19732   match(Set dst (VectorMaskCmp src1 src2));
19733   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed8I" %}
19734   ins_encode %{
19735     int vector_len = 1;
19736     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19737   %}
19738   ins_pipe( pipe_slow );
19739 %}
19740 
19741 instruct vcmplt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19742   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19743             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19744             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19745   match(Set dst (VectorMaskCmp src1 src2));
19746   effect(TEMP dst, TEMP scratch);
19747   format %{ "vpcmpltd  k2,$src1,$src2\n\t"
19748             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed16I" %}
19749   ins_encode %{
19750     int vector_len = 2;
19751     Assembler::ComparisonPredicate cmp = Assembler::lt;
19752     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19753     KRegister mask = k0; // The comparison itself is not being masked.
19754     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19755     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19756   %}
19757   ins_pipe( pipe_slow );
19758 %}
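
// There is no vpcmpltd below AVX-512, so the lt rules above emit vpcmpgtd with
// the operands swapped, relying on a < b  <==>  b > a. The identity in Java
// terms (illustrative only):
//
//   static boolean lt(int a, int b) {
//     return b > a;  // same lane result the swapped vpcmpgtd computes
//   }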
19759 
19760 instruct vcmpgt2I(vecD dst, vecD src1, vecD src2) %{
19761   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19762             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19763             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19764   match(Set dst (VectorMaskCmp src1 src2));
19765   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed2I" %}
19766   ins_encode %{
19767     int vector_len = 0;
19768     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19769   %}
19770   ins_pipe( pipe_slow );
19771 %}
19772 
19773 instruct vcmpgt4I(vecX dst, vecX src1, vecX src2) %{
19774   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19775             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19776             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19777   match(Set dst (VectorMaskCmp src1 src2));
19778   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed4I" %}
19779   ins_encode %{
19780     int vector_len = 0;
19781     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19782   %}
19783   ins_pipe( pipe_slow );
19784 %}
19785 
19786 instruct vcmpgt8I(vecY dst, vecY src1, vecY src2) %{
19787   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19788             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19789             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19790   match(Set dst (VectorMaskCmp src1 src2));
19791   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed8I" %}
19792   ins_encode %{
19793     int vector_len = 1;
19794     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19795   %}
19796   ins_pipe( pipe_slow );
19797 %}
19798 
19799 instruct vcmpgt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19800   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19801             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19802             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19803   match(Set dst (VectorMaskCmp src1 src2));
19804   effect(TEMP dst, TEMP scratch);
19805   format %{ "vpcmpnled  k2,$src1,$src2\n\t"
19806             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16I" %}
19807   ins_encode %{
19808     int vector_len = 2;
19809     Assembler::ComparisonPredicate cmp = Assembler::nle;
19810     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19811     KRegister mask = k0; // The comparison itself is not being masked.
19812     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19813     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19814   %}
19815   ins_pipe( pipe_slow );
19816 %}
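
// ge, le and ne likewise have no direct AVX/AVX2 encodings; the rules below
// synthesize them as the complement of the opposite compare (e.g. a >= b is
// !(a < b)), computed as a vpcmpgtd followed by vpxor against all-ones. This
// is only valid for integer lanes, where the ordering is total. Sketch:
//
//   static void cmpgeMask(int[] a, int[] b, int[] mask) {
//     for (int i = 0; i < a.length; i++) {
//       mask[i] = ~((b[i] > a[i]) ? -1 : 0);  // NOT(lt) == ge for ints
//     }
//   }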
19817 
19818 instruct vcmpge2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
19819   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19820             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19821             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19822   match(Set dst (VectorMaskCmp src1 src2));
19823   effect(TEMP scratch);
19824   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
19825             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed2I" %}
19826   ins_encode %{
19827     int vector_len = 0;
19828     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19829     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19830   %}
19831   ins_pipe( pipe_slow );
19832 %}
19833 
19834 instruct vcmpge4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
19835   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19836             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19837             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19838   match(Set dst (VectorMaskCmp src1 src2));
19839   effect(TEMP scratch);
19840   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
19841             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4I" %}
19842   ins_encode %{
19843     int vector_len = 0;
19844     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19845     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19846   %}
19847   ins_pipe( pipe_slow );
19848 %}
19849 
19850 instruct vcmpge8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
19851   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19852             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19853             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19854   match(Set dst (VectorMaskCmp src1 src2));
19855   effect(TEMP scratch);
19856   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
19857             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8I" %}
19858   ins_encode %{
19859     int vector_len = 1;
19860     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19861     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19862   %}
19863   ins_pipe( pipe_slow );
19864 %}
19865 
19866 instruct vcmpge16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19867   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19868             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19869             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19870   match(Set dst (VectorMaskCmp src1 src2));
19871   effect(TEMP dst, TEMP scratch);
19872   format %{ "vpcmpnltd  k2,$src1,$src2\n\t"
19873             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16I" %}
19874   ins_encode %{
19875     int vector_len = 2;
19876     Assembler::ComparisonPredicate cmp = Assembler::nlt;
19877     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19878     KRegister mask = k0; // The comparison itself is not being masked.
19879     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19880     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19881   %}
19882   ins_pipe( pipe_slow );
19883 %}
19884 
19885 instruct vcmple2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
19886   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19887             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19888             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19889   match(Set dst (VectorMaskCmp src1 src2));
19890   effect(TEMP scratch);
19891   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
19892             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed2I" %}
19893   ins_encode %{
19894     int vector_len = 0;
19895     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19896     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19897   %}
19898   ins_pipe( pipe_slow );
19899 %}
19900 
19901 instruct vcmple4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
19902   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19903             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19904             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19905   match(Set dst (VectorMaskCmp src1 src2));
19906   effect(TEMP scratch);
19907   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
19908             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4I" %}
19909   ins_encode %{
19910     int vector_len = 0;
19911     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19912     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19913   %}
19914   ins_pipe( pipe_slow );
19915 %}
19916 
19917 instruct vcmple8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
19918   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19919             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19920             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19921   match(Set dst (VectorMaskCmp src1 src2));
19922   effect(TEMP scratch);
19923   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
19924             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8I" %}
19925   ins_encode %{
19926     int vector_len = 1;
19927     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19928     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19929   %}
19930   ins_pipe( pipe_slow );
19931 %}
19932 
19933 instruct vcmple16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19934   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19935             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19936             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19937   match(Set dst (VectorMaskCmp src1 src2));
19938   effect(TEMP dst, TEMP scratch);
19939   format %{ "vpcmpled  k2,$src1,$src2\n\t"
19940             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16I" %}
19941   ins_encode %{
19942     int vector_len = 2;
19943     Assembler::ComparisonPredicate cmp = Assembler::le;
19944     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19945     KRegister mask = k0; // The comparison itself is not being masked.
19946     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19947     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19948   %}
19949   ins_pipe( pipe_slow );
19950 %}
19951 
19952 instruct vcmpne2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
19953   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19954             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19955             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19956   match(Set dst (VectorMaskCmp src1 src2));
19957   effect(TEMP scratch);
19958   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
19959             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed2I" %}
19960   ins_encode %{
19961     int vector_len = 0;
19962     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19963     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19964   %}
19965   ins_pipe( pipe_slow );
19966 %}
19967 
19968 instruct vcmpne4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
19969   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19970             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19971             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19972   match(Set dst (VectorMaskCmp src1 src2));
19973   effect(TEMP scratch);
19974   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
19975             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4I" %}
19976   ins_encode %{
19977     int vector_len = 0;
19978     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19979     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19980   %}
19981   ins_pipe( pipe_slow );
19982 %}
19983 
19984 instruct vcmpne8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
19985   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19986             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19987             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19988   match(Set dst (VectorMaskCmp src1 src2));
19989   effect(TEMP scratch);
19990   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
19991             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8I" %}
19992   ins_encode %{
19993     int vector_len = 1;
19994     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19995     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19996   %}
19997   ins_pipe( pipe_slow );
19998 %}
19999 
20000 instruct vcmpne16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20001   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20002             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20003             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20004   match(Set dst (VectorMaskCmp src1 src2));
20005   effect(TEMP dst, TEMP scratch);
20006   format %{ "vpcmpneqd  k2,$src1,$src2\n\t"
20007             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed16I" %}
20008   ins_encode %{
20009     int vector_len = 2;
20010     Assembler::ComparisonPredicate cmp = Assembler::neq;
20011     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20012     KRegister mask = k0; // The comparison itself is not being masked.
20013     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20014     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20015   %}
20016   ins_pipe( pipe_slow );
20017 %}
20018 
20019 instruct vcmpeq8B(vecD dst, vecD src1, vecD src2) %{
20020   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20021             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20022             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20023   match(Set dst (VectorMaskCmp src1 src2));
20024   format %{ "vpcmpeqb  $dst,$src1,$src2\t! cmpeq packed8B" %}
20025   ins_encode %{
20026     int vector_len = 0;
20027     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20028   %}
20029   ins_pipe( pipe_slow );
20030 %}
20031 
20032 instruct vcmpeq16B(vecX dst, vecX src1, vecX src2) %{
20033   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20034             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20035             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20036   match(Set dst (VectorMaskCmp src1 src2));
20037   format %{ "vpcmpeqb  $dst,$src1,$src2\t! cmpeq packed16B" %}
20038   ins_encode %{
20039     int vector_len = 0;
20040     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20041   %}
20042   ins_pipe( pipe_slow );
20043 %}
20044 
20045 instruct vcmpeq32B(vecY dst, vecY src1, vecY src2) %{
20046   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20047             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20048             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20049   match(Set dst (VectorMaskCmp src1 src2));
20050   format %{ "vpcmpeqb  $dst,$src1,$src2\t! cmpeq packed32B" %}
20051   ins_encode %{
20052     int vector_len = 1;
20053     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20054   %}
20055   ins_pipe( pipe_slow );
20056 %}
20057 
20058 instruct vcmpeq64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20059   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20060             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20061             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20062   match(Set dst (VectorMaskCmp src1 src2));
20063   effect(TEMP dst, TEMP scratch);
20064   format %{ "vpcmpeqb  k2,$src1,$src2\n\t"
20065             "vmovdqu8 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed64B" %}
20066   ins_encode %{
20067     int vector_len = 2;
20068     Assembler::ComparisonPredicate cmp = Assembler::eq;
20069     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20070     KRegister mask = k0; // The comparison itself is not being masked.
20071     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20072     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20073   %}
20074   ins_pipe( pipe_slow );
20075 %}
20076 
20077 instruct vcmplt8B(vecD dst, vecD src1, vecD src2) %{
20078   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20079             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20080             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20081   match(Set dst (VectorMaskCmp src1 src2));
20082   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed8B" %}
20083   ins_encode %{
20084     int vector_len = 0;
20085     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20086   %}
20087   ins_pipe( pipe_slow );
20088 %}
20089 
20090 instruct vcmplt16B(vecX dst, vecX src1, vecX src2) %{
20091   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20092             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20093             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20094   match(Set dst (VectorMaskCmp src1 src2));
20095   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed16B" %}
20096   ins_encode %{
20097     int vector_len = 0;
20098     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20099   %}
20100   ins_pipe( pipe_slow );
20101 %}
20102 
20103 instruct vcmplt32B(vecY dst, vecY src1, vecY src2) %{
20104   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20105             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20106             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20107   match(Set dst (VectorMaskCmp src1 src2));
20108   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed32B" %}
20109   ins_encode %{
20110     int vector_len = 1;
20111     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20112   %}
20113   ins_pipe( pipe_slow );
20114 %}
20115 
20116 instruct vcmplt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20117   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20118             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20119             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20120   match(Set dst (VectorMaskCmp src1 src2));
20121   effect(TEMP dst, TEMP scratch);
20122   format %{ "vpcmpltb  k2,$src1,$src2\n\t"
20123             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed64B" %}
20124   ins_encode %{
20125     int vector_len = 2;
20126     Assembler::ComparisonPredicate cmp = Assembler::lt;
20127     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20128     KRegister mask = k0; // The comparison itself is not being masked.
20129     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20130     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20131   %}
20132   ins_pipe( pipe_slow );
20133 %}
20134 
20135 instruct vcmpgt8B(vecD dst, vecD src1, vecD src2) %{
20136   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20137             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20138             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20139   match(Set dst (VectorMaskCmp src1 src2));
20140   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed8B" %}
20141   ins_encode %{
20142     int vector_len = 0;
20143     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20144   %}
20145   ins_pipe( pipe_slow );
20146 %}
20147 
20148 instruct vcmpgt16B(vecX dst, vecX src1, vecX src2) %{
20149   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20150             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20151             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20152   match(Set dst (VectorMaskCmp src1 src2));
20153   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed16B" %}
20154   ins_encode %{
20155     int vector_len = 0;
20156     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20157   %}
20158   ins_pipe( pipe_slow );
20159 %}
20160 
20161 instruct vcmpgt32B(vecY dst, vecY src1, vecY src2) %{
20162   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20163             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20164             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20165   match(Set dst (VectorMaskCmp src1 src2));
20166   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed32B" %}
20167   ins_encode %{
20168     int vector_len = 1;
20169     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20170   %}
20171   ins_pipe( pipe_slow );
20172 %}
20173 
20174 instruct vcmpgt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20175   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20176             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20177             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20178   match(Set dst (VectorMaskCmp src1 src2));
20179   effect(TEMP dst, TEMP scratch);
20180   format %{ "vpcmpnleb  k2,$src1,$src2\n\t"
20181             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed64B" %}
20182   ins_encode %{
20183     int vector_len = 2;
20184     Assembler::ComparisonPredicate cmp = Assembler::nle;
20185     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20186     KRegister mask = k0; // The comparison itself is not being masked.
20187     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20188     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20189   %}
20190   ins_pipe( pipe_slow );
20191 %}
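
// The byte compares are signed (vpcmpgtb/vpcmpnleb), matching Java, where byte
// values are always signed; the 512-bit byte forms additionally require
// AVX-512BW (supports_avx512bw), since AVX-512F alone has no byte-lane compare.
// Scalar equivalent of the signed mask (hypothetical helper):
//
//   static void cmpgtMask(byte[] a, byte[] b, byte[] mask) {
//     for (int i = 0; i < a.length; i++) {
//       mask[i] = (byte) ((a[i] > b[i]) ? -1 : 0);  // signed byte compare
//     }
//   }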
20192 
20193 instruct vcmpge8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
20194   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20195             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20196             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20197   match(Set dst (VectorMaskCmp src1 src2));
20198   effect(TEMP scratch);
20199   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
20200             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8B" %}
20201   ins_encode %{
20202     int vector_len = 0;
20203     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20204     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20205   %}
20206   ins_pipe( pipe_slow );
20207 %}
20208 
20209 instruct vcmpge16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
20210   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20211             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20212             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20213   match(Set dst (VectorMaskCmp src1 src2));
20214   effect(TEMP scratch);
20215   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
20216             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16B" %}
20217   ins_encode %{
20218     int vector_len = 0;
20219     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20220     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20221   %}
20222   ins_pipe( pipe_slow );
20223 %}
20224 
20225 instruct extract8d(regD dst, vecZ src, vecZ tmp, immI idx) %{
20226   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20227   match(Set dst (ExtractD src idx));
20228   effect(TEMP tmp);
20229   ins_encode %{
20230     int vector_len = 2;
20231     int midx = 0x7 & $idx$$constant;
20232     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20233       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20234     } else if (midx == 1) {
20235       __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20236     } else if (midx > 1 && midx <= 7) {
20237       int extr_idx1 = midx / 2;
20238       int extr_idx2 = midx % 2;
20239       __ vextractf32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
20240       __ vpshufpd($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, extr_idx2, vector_len);
20241     }
20242   %}
20243   ins_pipe( pipe_slow );
20244 %}
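
// The wide extracts decompose the constant lane index into a 128-bit slice
// number plus an offset within that slice: with two doubles per 128 bits,
// midx / 2 selects the slice for vextractf32x4 and midx % 2 the shuffle
// position inside it (four lanes per slice for floats/ints, two for
// doubles/longs). A hedged sketch of the arithmetic (hypothetical helper):
//
//   static int[] splitIndex(int midx, int lanesPer128) {
//     return new int[] { midx / lanesPer128, midx % lanesPer128 };
//   }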
20245 
20246 instruct extract4d(regD dst, vecY src, vecY tmp, immI idx) %{
20247   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20248   match(Set dst (ExtractD src idx));
20249   effect(TEMP tmp);
20250   ins_encode %{
20251     int vector_len = 1;
20252     int midx = 0x3 & $idx$$constant;
20253     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20254       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20255     } else if (midx == 1) {
20256       __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20257     } else if (midx > 1 && midx <= 3) {
20258       __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20259       __ vpshufpd($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, midx - 2, vector_len);
20260     }
20261   %}
20262   ins_pipe( pipe_slow );
20263 %}
20264 
20265 instruct extract2d(regD dst, vecX src, immI idx) %{
20266   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20267   match(Set dst (ExtractD src idx));
20268   ins_encode %{
20269     int midx = 0x1 & $idx$$constant;
20270     if ($dst$$XMMRegister != $src$$XMMRegister) {
20271       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20272     }
20273     if (midx > 0) {
20274       __ pshufpd($dst$$XMMRegister, $dst$$XMMRegister, midx);
20275     }
20276   %}
20277   ins_pipe( pipe_slow );
20278 %}
20279 
20280 instruct extract1d(regD dst, vecD src, immI idx) %{
20281   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 1);
20282   match(Set dst (ExtractD src idx));
20283   ins_encode %{
20284     int midx = 0x1 & $idx$$constant;
20285     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20286       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20287     }
20288   %}
20289   ins_pipe( pipe_slow );
20290 %}
20291 
20292 instruct extract16f(regF dst, vecZ src, vecZ tmp, immI idx) %{
20293   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 16);
20294   match(Set dst (ExtractF src idx));
20295   effect(TEMP tmp);
20296   ins_encode %{
20297     int vector_len = 2;
20298     int midx = 0xF & $idx$$constant;
20299     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20300       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20301     } else if (midx >= 1 && midx <= 3) {
20302       __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20303     } else {
20304       int extr_idx1 = midx / 4;
20305       int extr_idx2 = midx % 4;
20306       __ vextractf32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
20307       __ vpshufps($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, extr_idx2, vector_len);
20308     }
20309   %}
20310   ins_pipe( pipe_slow );
20311 %}
20312 
20313 instruct extract8f(regF dst, vecY src, vecY tmp, immI idx) %{
20314   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20315   match(Set dst (ExtractF src idx));
20316   effect(TEMP tmp);
20317   ins_encode %{
20318     int vector_len = 1;
20319     int midx = 0x7 & $idx$$constant;
20320     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20321       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20322     } else if (midx >= 1 && midx <= 3) {
20323       __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20324     } else if (midx >= 4) {
20325       __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20326       __ vpshufps($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, midx - 4, vector_len);
20327     }
20328   %}
20329   ins_pipe( pipe_slow );
20330 %}
20331 
20332 instruct extract4f(regF dst, vecX src, immI idx) %{
20333   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20334   match(Set dst (ExtractF src idx));
20335   ins_encode %{
20336     int midx = 0x3 & $idx$$constant;
20337     if ($dst$$XMMRegister != $src$$XMMRegister) {
20338       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20339     }
20340     if (midx > 0) {
20341       __ pshufps($dst$$XMMRegister, $dst$$XMMRegister, midx);
20342     }
20343   %}
20344   ins_pipe( pipe_slow );
20345 %}
20346 
20347 instruct extract2f(regF dst, vecD src, immI idx) %{
20348   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20349   match(Set dst (ExtractF src idx));
20350   ins_encode %{
20351     int midx = 0x1 & $idx$$constant;
20352     if ($dst$$XMMRegister != $src$$XMMRegister) {
20353       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20354     }
20355     if (midx > 0) {
20357       __ pshufps($dst$$XMMRegister, $dst$$XMMRegister, midx);
20358     }
20359   %}
20360   ins_pipe( pipe_slow );
20361 %}
20362 
20363 instruct extract1l(rRegL dst, vecD src, immI idx) %{
20364   predicate(UseSSE > 1 && n->in(1)->bottom_type()->is_vect()->length() == 1);
20365   match(Set dst (ExtractL src idx));
20366   ins_encode %{
20367     int midx = 0x1 & $idx$$constant;
20368     if (midx == 0) {
20369       __ movq($dst$$Register, $src$$XMMRegister);
20370     }
20371   %}
20372   ins_pipe( pipe_slow );
20373 %}
20374 
20375 instruct extract2l(rRegL dst, vecX src, immI idx) %{
20376   predicate(UseSSE >= 4 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20377   match(Set dst (ExtractL src idx));
20378   ins_encode %{
20379     int midx = 0x1 & $idx$$constant;
20380     if (midx == 0) {
20381       __ movq($dst$$Register, $src$$XMMRegister);
20382     } else {
20383       __ pextrq($dst$$Register, $src$$XMMRegister, midx);
20384     }
20385   %}
20386   ins_pipe( pipe_slow );
20387 %}
20388 
20389 instruct extract4l(rRegL dst, vecY src, immI idx, vecX tmp) %{
20390   predicate(UseAVX > 1 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20391   match(Set dst (ExtractL src idx));
20392   effect(TEMP tmp);
20393   ins_encode %{
20394     int midx = 0x3 & $idx$$constant;
20395     if (midx == 0) {
20396       __ movq($dst$$Register, $src$$XMMRegister);
20397     } else if (midx == 1) {
20398       __ pextrq($dst$$Register, $src$$XMMRegister, midx);
20399     } else {
20400       __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20401       __ pextrq($dst$$Register, $tmp$$XMMRegister, midx - 2);
20402     }
20403   %}
20404   ins_pipe( pipe_slow );
20405 %}
20406 
20407 instruct extract8l(rRegL dst, vecZ src, vecX tmp, immI idx) %{
20408   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20409   match(Set dst (ExtractL src idx));
20410   effect(TEMP tmp);
20411   ins_encode %{
20412     int midx = 0x7 & $idx$$constant;
20413     if (midx == 0) {
20414       __ movq($dst$$Register, $src$$XMMRegister);
20415     } else if (midx == 1) {
20416       __ pextrq($dst$$Register, $src$$XMMRegister, midx);
20417     } else {
20418       // Using 2 because there are 2 longs in 128-bit
20419       int extr_idx1 = midx / 2;
20420       int extr_idx2 = midx % 2;
20421       __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
20422       __ pextrq($dst$$Register, $tmp$$XMMRegister, extr_idx2);
20423     }
20424   %}
20425   ins_pipe( pipe_slow );
20426 %}
20427 
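// Int extraction mirrors the long case: movdl for lane 0, pextrd for the
// rest, narrowing wider sources to the containing 128-bit lane first.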
instruct extract2i(rRegI dst, vecD src, immI idx) %{
  predicate(UseSSE > 3 && n->in(1)->bottom_type()->is_vect()->length() == 2);
  match(Set dst (ExtractI src idx));
  ins_encode %{
    int midx = 0x1 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract4i(rRegI dst, vecX src, immI idx) %{
  predicate(UseSSE > 3 && n->in(1)->bottom_type()->is_vect()->length() == 4);
  match(Set dst (ExtractI src idx));
  ins_encode %{
    int midx = 0x3 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract8i(rRegI dst, vecY src, vecX tmp, immI idx) %{
  predicate(UseAVX > 1 && n->in(1)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (ExtractI src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    } else if (midx >= 4) {
      __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
      __ pextrd($dst$$Register, $tmp$$XMMRegister, midx - 4);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract16i(rRegI dst, vecZ src, vecX tmp, immI idx) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (ExtractI src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    } else {
      // Using 4 because there are 4 ints in a 128-bit lane
      int extr_idx1 = midx / 4;
      int extr_idx2 = midx % 4;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrd($dst$$Register, $tmp$$XMMRegister, extr_idx2);
    }
  %}
  ins_pipe( pipe_slow );
%}

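// Short extraction sign-extends the 16-bit element to 32 bits with movswl,
// since pextrw zero-fills the upper bits of the destination register.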
instruct extract4s(rRegI dst, vecD src, immI idx) %{
  predicate(UseSSE > 1 && n->in(1)->bottom_type()->is_vect()->length() == 4);
  match(Set dst (ExtractS src idx));
  ins_encode %{
    int midx = 0x3 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract8s(rRegI dst, vecX src, immI idx) %{
  predicate(UseSSE > 1 && n->in(1)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (ExtractS src idx));
  ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract16s(rRegI dst, vecY src, vecX tmp, immI idx) %{
  predicate(UseAVX > 1 && n->in(1)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (ExtractS src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 7) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    } else {
      __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
      __ pextrw($dst$$Register, $tmp$$XMMRegister, midx - 8);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract32s(rRegI dst, vecZ src, vecX tmp, immI idx) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 32);
  match(Set dst (ExtractS src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x1F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 7) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    } else {
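      // Using 8 because there are 8 shorts in a 128-bit lane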
      int extr_idx1 = midx / 8;
      int extr_idx2 = midx % 8;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrw($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

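// Byte extraction works the same way, sign-extending the 8-bit element to
// 32 bits with movsbl.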
instruct extract8b(rRegI dst, vecD src, immI idx) %{
  predicate(UseSSE >= 4 && n->in(1)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (ExtractB src idx));
  ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract16b(rRegI dst, vecX src, immI idx) %{
  predicate(UseSSE >= 4 && n->in(1)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (ExtractB src idx));
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract32b(rRegI dst, vecY src, vecX tmp, immI idx) %{
  predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 32);
  match(Set dst (ExtractB src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x1F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 15) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    } else {
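      // Using 16 because there are 16 bytes in a 128-bit lane.
      // vextractf128 is used here because the pattern only requires AVX1;
      // the integer form vextracti128 would need AVX2.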
      int extr_idx1 = midx / 16;
      int extr_idx2 = midx % 16;
      __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrb($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract64b(rRegI dst, vecZ src, vecX tmp, immI idx) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 64);
  match(Set dst (ExtractB src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x3F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 15) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    } else {
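      // Using 16 because there are 16 bytes in a 128-bit lane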
      int extr_idx1 = midx / 16;
      int extr_idx2 = midx % 16;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrb($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

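// Compare-greater-or-equal has no direct AVX/AVX2 byte instruction: it is
// synthesized as NOT(src2 > src1), i.e. vpcmpgt with the operands swapped
// followed by an XOR with all-ones.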
instruct vcmpge32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

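// On AVX-512BW the comparison writes a k mask register instead; the boolean
// vector is then materialized by a zero-masked move of an all-ones constant.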
instruct vcmpge64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpnltb  k2,$src1,$src2\n\t"
            "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed64B" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::nlt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

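// Compare-less-or-equal is likewise synthesized as NOT(src1 > src2).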
instruct vcmple8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpleb  k2,$src1,$src2\n\t"
            "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed64B" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::le;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

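// Compare-not-equal is synthesized as NOT(src1 == src2).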
instruct vcmpne8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpneqb  k2,$src1,$src2\n\t"
            "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed64B" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::neq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq4S(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq8S(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq16S(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpeqw  k2,$src1,$src2\n\t"
            "vmovdqu16 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::eq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

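// Compare-less-than needs no inversion: src1 < src2 is computed directly as
// src2 > src1 with vpcmpgt and the operands swapped.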
instruct vcmplt4S(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt8S(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt16S(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpltw  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::lt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt4S(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt8S(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt16S(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpnlew  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::nle;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpnltw  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::nlt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmplew  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::le;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpneqw  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::neq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

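// Long (64-bit element) compares map directly onto vpcmpeqq/vpcmpgtq;
// as with the other types, lt is computed as gt with the operands swapped.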
instruct vcmpeq1L(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq2L(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq4L(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpeqq  k2,$src1,$src2\n\t"
            "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8L" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::eq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt1L(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt2L(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt4L(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpltq  k2,$src1,$src2\n\t"
            "vmovdqu64   $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8L" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::lt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt1L(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt2L(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed2L" %}