//
// Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
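//
// For example (purely for illustration here -- the general-purpose
// registers themselves are defined in x86_32.ad/x86_64.ad, not in this
// file), a definition such as
//   reg_def RAX (SOC, SOC, Op_RegI, 0, rax->as_VMReg());
// declares a save-on-call register of ideal type Op_RegI (spilled with
// LoadI/StoreI) whose hardware encoding in opcodes is 0.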

// XMM registers.  512-bit registers of 16 32-bit words each, labeled (a)-(p).
// Word (a) in each register holds a Float; the word pair (a)-(b) holds a
// Double. The whole registers are used by SSE4.2 intrinsics, the array copy
// stubs and superword operations (see the UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   no XMM registers are preserved across function calls;
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 are preserved across function calls
//              (XMM16-XMM31 are volatile); XMM0-XMM3 might hold parameters
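//
// Naming convention below: word (a) uses the bare register name (XMM0) and
// words (b)-(p) append a suffix letter (XMM0b ... XMM0p), each mapped to
// the following 32-bit VMReg slot via ->next(n).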

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

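// Flags (condition-code) register. It has no ideal register type (the 0
// below) and no underlying VMReg (VMRegImpl::Bad()); it is exposed to the
// matcher only through the int_flags class defined further down.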
#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

// The flags allocation class should be last.
alloc_class chunk2(RFLAGS);

// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
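
// Note: the *_vl register classes above and below additionally require
// AVX512VL, the vector-length extension that makes EVEX encodings available
// for 128-bit and 256-bit (sub-512-bit) operands.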

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32-bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32-bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
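
// A note on the predicate above (as an aside, not part of the original
// comments): supports_avx512vlbwdq() requires the AVX512VL, AVX512BW and
// AVX512DQ extensions together, i.e. EVEX support for byte/word/dword/qword
// element types at all vector lengths.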

// Class for pre-EVEX 64-bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64-bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 128-bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for EVEX 128-bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 256-bit vector registers
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
#endif
                      );

// Class for EVEX 256-bit vector registers
reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
1245 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
1285     // three 5 byte instructions plus one move for unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
// Jump to the VM runtime routine that handles the pending exception.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
  __ call(next, relocInfo::none); // reloc none is fine since it is a disp32 call
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369   static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
1370   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
1371   static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
1372   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
1373   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1374   static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
1375   static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
1376   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1377   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1378   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1379   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1380   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1381   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1382   static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
1383   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1384   static address vector_int_sizemask() { return StubRoutines::x86::vector_int_size_mask(); }
1385   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1386   static address vector_short_sizemask() { return StubRoutines::x86::vector_short_size_mask(); }
1387   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1388   static address vector_long_sizemask() { return StubRoutines::x86::vector_long_size_mask(); }
1389 #else
1390   static address float_signmask()  { return (address)float_signmask_pool; }
1391   static address float_signflip()  { return (address)float_signflip_pool; }
1392   static address double_signmask() { return (address)double_signmask_pool; }
1393   static address double_signflip() { return (address)double_signflip_pool; }
1394 #endif
1395 
1396 
1397 const bool Matcher::match_rule_supported(int opcode) {
1398   if (!has_match_rule(opcode))
1399     return false;
1400 
1401   bool ret_value = true;
1402   switch (opcode) {
1403     case Op_PopCountI:
1404     case Op_PopCountL:
1405       if (!UsePopCountInstruction)
1406         ret_value = false;
1407       break;
1408     case Op_PopCountVI:
1409       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1410         ret_value = false;
1411       break;
1412     case Op_MulVI:
1413     case Op_MulVL:
1414       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1415         ret_value = false;
1416       break;
1417     case Op_MulReductionVL:
1418       if (VM_Version::supports_avx512dq() == false)
1419         ret_value = false;
1420       break;
1421     case Op_AddReductionVL:
1422       if (UseSSE < 2) // requires at least SSE2
1423         ret_value = false;
1424       break;
1425     case Op_MulReductionVI:
1426       if (UseSSE < 4) // requires at least SSE4
1427         ret_value = false;
1428       break;
1429     case Op_AddReductionVF:
1430     case Op_AddReductionVD:
1431     case Op_MulReductionVF:
1432     case Op_MulReductionVD:
1433       if (UseSSE < 1) // requires at least SSE
1434         ret_value = false;
1435       break;
1436     case Op_SqrtVD:
1437     case Op_SqrtVF:
1438       if (UseAVX < 1) // enabled for AVX only
1439         ret_value = false;
1440       break;
1441     case Op_CompareAndSwapL:
1442 #ifdef _LP64
1443     case Op_CompareAndSwapP:
1444 #endif
1445       if (!VM_Version::supports_cx8())
1446         ret_value = false;
1447       break;
1448     case Op_CMoveVF:
1449     case Op_CMoveVD:
1450       if (UseAVX < 1 || UseAVX > 2)
1451         ret_value = false;
1452       break;
1453     case Op_StrIndexOf:
1454       if (!UseSSE42Intrinsics)
1455         ret_value = false;
1456       break;
1457     case Op_StrIndexOfChar:
1458       if (!UseSSE42Intrinsics)
1459         ret_value = false;
1460       break;
1461     case Op_OnSpinWait:
1462       if (VM_Version::supports_on_spin_wait() == false)
1463         ret_value = false;
1464       break;
1465     case Op_MulAddVS2VI:
1466       if (UseSSE < 2)
1467         ret_value = false;
1468       break;
1469 #ifdef _LP64
1470     case Op_MaxD:
1471     case Op_MaxF:
1472     case Op_MinD:
1473     case Op_MinF:
1474       if (UseAVX < 1) // enabled for AVX only
1475         ret_value = false;
1476       break;
1477 #endif
1478   }
1479 
  return ret_value;  // By default, match rules are supported.
1481 }
1482 
1483 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt, int op_arity) {
  // Identify extra cases that we might want to provide match rules for,
  // e.g. Op_* vector nodes and other intrinsics, while guarding with vlen.
1486   bool ret_value = match_rule_supported(opcode);
1487   if (ret_value) {
1488     int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
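    // e.g. vlen == 8 with bt == T_INT gives 8 * 4 * 8 = 256 bits.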
1489     if (!vector_size_supported(bt, vlen)) {
1490       ret_value = false;
1491     } else if (size_in_bits > 256 && UseAVX <= 2) {
1492       // Only AVX512 supports 512-bit vectors
1493       ret_value = false;
1494     } else if (UseAVX == 0 && size_in_bits > 128) {
1495       // Only AVX supports 256-bit vectors
1496       ret_value = false;
1497     } else if (is_subword_type(bt) && size_in_bits == 512 && VM_Version::supports_avx512bw() == false) {
      // 512-bit byte and short vectors require AVX512BW.
1499       ret_value = false;
1500     } else {
1501         switch (opcode) {
1502         case Op_AbsV:
1503           if (is_integral_type(bt) && UseSSE < 3) { ret_value = false; }
1504           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1505           else if (bt == T_LONG && UseAVX <= 2) { ret_value = false; } // Implementation limitation
1506           break;
1507         case Op_AddVB:
1508         case Op_SubVB:
1509           if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1510             ret_value = false;
1511           break;
1512         case Op_MaxV:
1513         case Op_MinV:
1514           if (UseSSE < 4 && (bt == T_BYTE || bt == T_INT || bt == T_LONG))
1515             ret_value = false;
1516 
1517           if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1518             // Float/Double intrinsics are enabled for AVX family currently.
1519             if (UseAVX == 0)
1520               ret_value = false;
1521             // 512 bit Float/Double intrinsics need AVX512DQ
1522             if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512))
1523               ret_value = false;
1524           }
1525           break;
1526         case Op_MulVB:
1527         case Op_LShiftVB:
1528         case Op_RShiftVB:
1529         case Op_URShiftVB:
1530         case Op_LShiftVS:
1531         case Op_RShiftVS:
1532         case Op_URShiftVS:
1533           if (size_in_bits <= 128 && UseSSE < 4) { ret_value = false; }
1534           else if (size_in_bits > 256 && UseAVX < 2) { ret_value = false; }
1535           break;
        case Op_LShiftVI:
        case Op_RShiftVI:
        case Op_URShiftVI:
        case Op_LShiftVL:
        case Op_RShiftVL:
        case Op_URShiftVL:
          if (op_arity == 2 && UseAVX <= 1)
            ret_value = false;
          break;
1548         case Op_MulVS:
1549         case Op_AddVS:
1550         case Op_SubVS:
1551           if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1552             ret_value = false;
1553           break;
1554         case Op_CallLeafVector:
1555           if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq())
1556             ret_value = false;
1557           break;
1558         case Op_CMoveVF:
1559           if (vlen != 8)
            ret_value = false;
1561           break;
1562         case Op_CMoveVD:
1563           if (vlen != 4)
            ret_value = false;
1565           break;
1566         case Op_AddReductionVI:
1567           if (bt == T_INT && UseSSE < 3) { ret_value = false; }
1568           else if (is_subword_type(bt) && UseSSE <= 3) { ret_value = false; }
1569           break;
1570         case Op_AndReductionV:
1571         case Op_OrReductionV:
1572         case Op_XorReductionV:
1573           if (bt == T_BYTE && UseSSE <= 3) { ret_value = false; }
1574           break;
1575         case Op_VectorMaskCmp:
1576           if (UseAVX <= 0) { ret_value = false; }
1577           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1578           break;
1579         case Op_MinReductionV:
1580         case Op_MaxReductionV:
1581           if ((bt == T_INT || bt == T_LONG || bt == T_BYTE) && UseSSE <= 3) { ret_value = false; }
1582           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1583 
1584           // Float/Double intrinsics enabled for AVX family.
1585           if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1586             ret_value = false;
1587           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512))
1588             ret_value = false;
1589           break;
1590         case Op_VectorBlend:
1591           if (UseSSE <= 3 && UseAVX == 0) { ret_value = false; }
1592           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1593           break;
1594         case Op_VectorTest:
1595           if (UseAVX <= 0) { ret_value = false; }
1596           else if (size_in_bits != 128 && size_in_bits != 256) { ret_value = false; } // Implementation limitation
1597           break;
1598         case Op_VectorLoadMask:
1599           if (UseSSE <= 3) { ret_value = false; }
1600           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1601           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
1602           break;
1603         case Op_VectorLoadShuffle:
1604         case Op_VectorRearrange:
1605           if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation due to how shuffle is loaded
1606           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
1607           else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512vbmi())  { ret_value = false; } // Implementation limitation
1608           else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512vlbw())  { ret_value = false; } // Implementation limitation
1609           break;
1610         case Op_VectorStoreMask:
1611           if (UseAVX < 0) { ret_value = false; } // Implementation limitation
1612           else if ((size_in_bits >= 256 || bt == T_LONG || bt == T_DOUBLE) && UseAVX < 2) { ret_value = false; } // Implementation limitation
1613           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
1614           else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation
1615           break;
1616         case Op_VectorCastB2X:
1617           if (UseAVX <= 0) { ret_value = false; }
1618           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
1619           break;
1620         case Op_VectorCastS2X:
1621           if (UseAVX <= 0) { ret_value = false; }
1622           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1623           else if (is_integral_type(bt) && vlen * type2aelembytes(T_SHORT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1624           break;
1625         case Op_VectorCastI2X:
1626           if (UseAVX <= 0) { ret_value = false; }
1627           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1628           else if (is_integral_type(bt) && vlen * type2aelembytes(T_INT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1629           break;
1630         case Op_VectorCastL2X:
1631           if (UseAVX <= 0) { ret_value = false; }
1632           else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
1633           else if (is_integral_type(bt) && vlen * type2aelembytes(T_LONG) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
1634           else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) { ret_value = false; }
1635           break;
1636         case Op_VectorCastF2X:
1637           // Casts from FP to integral types require special fixup logic not easily
1638           // implementable with vectors.
1639           if (UseAVX <= 0) { ret_value = false; }
1640           else if (bt != T_DOUBLE) { ret_value = false; } // Implementation limitation
1641           break;
1642         case Op_VectorCastD2X:
1643           // Casts from FP to integral types require special fixup logic not easily
1644           // implementable with vectors.
1645           if (UseAVX <= 0) { ret_value = false; }
1646           else if (bt != T_FLOAT) { ret_value = false; } // Implementation limitation
1647           break;
1648         case Op_VectorReinterpret:
1649           if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
1650           break;
1651         case Op_MulReductionVI:
          if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; }
1653           break;
1654         case Op_FmaVD:
1655         case Op_FmaVF:
          if (!UseFMA) { ret_value = false; }
          break;
1657         case Op_LoadVectorGather:
1658           if (UseAVX < 2) { ret_value = false; }
          else if (size_in_bits == 64) { ret_value = false; }
1660           break;
1661         case Op_StoreVectorScatter:
1662           if (UseAVX < 3) { ret_value = false; }
          else if (size_in_bits == 64) { ret_value = false; }
1664           break;
1665         default:
1666           break;
1667       }
1668     }
1669   }
1670   if (ret_value) {
1671     assert(is_java_primitive(bt) && (vlen > 0) && is_power_of_2(vlen) &&
1672            vector_size_supported(bt, vlen), "must be supported");
1673   }
1674 
  return ret_value;  // By default, match rules are supported.
1676 }
1677 
1678 const bool Matcher::has_predicated_vectors(void) {
1679   bool ret_value = false;
1680   if (UseAVX > 2) {
1681     ret_value = VM_Version::supports_avx512vl();
1682   }
1683 
1684   return ret_value;
1685 }
1686 
1687 const int Matcher::float_pressure(int default_pressure_threshold) {
1688   int float_pressure_threshold = default_pressure_threshold;
1689 #ifdef _LP64
1690   if (UseAVX > 2) {
1691     // Increase pressure threshold on machines with AVX3 which have
1692     // 2x more XMM registers.
1693     float_pressure_threshold = default_pressure_threshold * 2;
1694   }
1695 #endif
1696   return float_pressure_threshold;
1697 }
1698 
1699 // Max vector size in bytes. 0 if not supported.
1700 const int Matcher::vector_width_in_bytes(BasicType bt) {
1701   assert(is_java_primitive(bt), "only primitive type vectors");
1702   if (UseSSE < 2) return 0;
1703   // SSE2 supports 128bit vectors for all types.
1704   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
1706   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
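  // e.g. UseAVX == 2 (AVX2) gives (1 << 2) * 8 = 32 bytes; UseAVX == 3 gives 64.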
1707   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1708   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1709     size = (UseAVX > 2) ? 64 : 32;
1710   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1711     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1712   // Use flag to limit vector size.
1713   size = MIN2(size,(int)MaxVectorSize);
1714   // Minimum 2 values in vector (or 4 for bytes).
1715   switch (bt) {
1716   case T_DOUBLE:
1717   case T_LONG:
1718     if (size < 16) return 0;
1719     break;
1720   case T_FLOAT:
1721   case T_INT:
1722     if (size < 8) return 0;
1723     break;
1724   case T_BOOLEAN:
1725     if (size < 4) return 0;
1726     break;
1727   case T_CHAR:
1728     if (size < 4) return 0;
1729     break;
1730   case T_BYTE:
1731     if (size < 4) return 0;
1732     break;
1733   case T_SHORT:
1734     if (size < 4) return 0;
1735     break;
1736   default:
1737     ShouldNotReachHere();
1738   }
1739   return size;
1740 }
1741 
1742 // Limits on vector size (number of elements) loaded into vector.
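// For example, with 64-byte (AVX-512) vectors max_vector_size(T_INT) is
// 64 / 4 = 16 elements, while min_vector_size(T_INT) is 2 and
// min_vector_size(T_BYTE) is 4.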
1743 const int Matcher::max_vector_size(const BasicType bt) {
1744   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1745 }
1746 const int Matcher::min_vector_size(const BasicType bt) {
1747   int max_size = max_vector_size(bt);
1748   // Min size which can be loaded into vector is 4 bytes.
1749   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1750   return MIN2(size,max_size);
1751 }
1752 
1753 // Vector ideal reg corresponding to specified size in bytes
1754 const uint Matcher::vector_ideal_reg(int size) {
1755   assert(MaxVectorSize >= size, "");
1756   switch(size) {
1757     case  4: return Op_VecS;
1758     case  8: return Op_VecD;
1759     case 16: return Op_VecX;
1760     case 32: return Op_VecY;
1761     case 64: return Op_VecZ;
1762   }
1763   ShouldNotReachHere();
1764   return 0;
1765 }
1766 
1767 // Only lowest bits of xmm reg are used for vector shift count.
1768 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1769   return Op_VecS;
1770 }
1771 
1772 // x86 supports misaligned vectors store/load.
1773 const bool Matcher::misaligned_vectors_ok() {
1774   return true;
1775 }
1776 
1777 // x86 AES instructions are compatible with SunJCE expanded
1778 // keys, hence we do not need to pass the original key to stubs
1779 const bool Matcher::pass_original_key_for_aes() {
1780   return false;
1781 }
1782 
1783 
1784 const bool Matcher::convi2l_type_required = true;
1785 
1786 // Check for shift by small constant as well
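// (i.e. shift counts up to 3, giving scales of 1/2/4/8: e.g. "idx << 3" for a
// long[] element can be folded into a [base + idx*8 + disp] addressing mode).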
1787 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1788   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1789       shift->in(2)->get_int() <= 3 &&
1790       // Are there other uses besides address expressions?
1791       !matcher->is_visited(shift)) {
1792     address_visited.set(shift->_idx); // Flag as address_visited
1793     mstack.push(shift->in(2), Matcher::Visit);
1794     Node *conv = shift->in(1);
1795 #ifdef _LP64
    // Allow Matcher to match the rule which bypasses
1797     // ConvI2L operation for an array index on LP64
1798     // if the index value is positive.
1799     if (conv->Opcode() == Op_ConvI2L &&
1800         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1801         // Are there other uses besides address expressions?
1802         !matcher->is_visited(conv)) {
1803       address_visited.set(conv->_idx); // Flag as address_visited
1804       mstack.push(conv->in(1), Matcher::Pre_Visit);
1805     } else
1806 #endif
1807       mstack.push(conv, Matcher::Pre_Visit);
1808     return true;
1809   }
1810   return false;
1811 }
1812 
1813 // Should the Matcher clone shifts on addressing modes, expecting them
1814 // to be subsumed into complex addressing expressions or compute them
1815 // into registers?
1816 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1817   Node *off = m->in(AddPNode::Offset);
1818   if (off->is_Con()) {
1819     address_visited.test_set(m->_idx); // Flag as address_visited
1820     Node *adr = m->in(AddPNode::Address);
1821 
1822     // Intel can handle 2 adds in addressing mode
1823     // AtomicAdd is not an addressing expression.
1824     // Cheap to find it by looking for screwy base.
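    // (e.g. [base + index*scale + disp] absorbs both adds into one operand.)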
1825     if (adr->is_AddP() &&
1826         !adr->in(AddPNode::Base)->is_top() &&
1827         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1828         // Are there other uses besides address expressions?
1829         !is_visited(adr)) {
1830       address_visited.set(adr->_idx); // Flag as address_visited
1831       Node *shift = adr->in(AddPNode::Offset);
1832       if (!clone_shift(shift, this, mstack, address_visited)) {
1833         mstack.push(shift, Pre_Visit);
1834       }
1835       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1836       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1837     } else {
1838       mstack.push(adr, Pre_Visit);
1839     }
1840 
1841     // Clone X+offset as it also folds into most addressing expressions
1842     mstack.push(off, Visit);
1843     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1844     return true;
1845   } else if (clone_shift(off, this, mstack, address_visited)) {
1846     address_visited.test_set(m->_idx); // Flag as address_visited
1847     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1848     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1849     return true;
1850   }
1851   return false;
1852 }
1853 
1854 void Compile::reshape_address(AddPNode* addp) {
1855 }
1856 
1857 // Helper methods for MachSpillCopyNode::implementation().
1858 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1859                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1862   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1863   assert(ireg == Op_VecS || // 32bit vector
1864          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1865          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1866          "no non-adjacent vector moves" );
1867   if (cbuf) {
1868     MacroAssembler _masm(cbuf);
1869     int offset = __ offset();
1870     switch (ireg) {
1871     case Op_VecS: // copy whole register
1872     case Op_VecD:
1873     case Op_VecX:
1874 #ifndef _LP64
1875       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1876 #else
1877       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1878         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1879       } else {
1880         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1881      }
1882 #endif
1883       break;
1884     case Op_VecY:
1885 #ifndef _LP64
1886       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1887 #else
1888       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1889         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1890       } else {
1891         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
1892      }
1893 #endif
1894       break;
1895     case Op_VecZ:
1896       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1897       break;
1898     default:
1899       ShouldNotReachHere();
1900     }
1901     int size = __ offset() - offset;
1902 #ifdef ASSERT
1903     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1905 #endif
1906     return size;
1907 #ifndef PRODUCT
1908   } else if (!do_size) {
1909     switch (ireg) {
1910     case Op_VecS:
1911     case Op_VecD:
1912     case Op_VecX:
1913       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1914       break;
1915     case Op_VecY:
1916     case Op_VecZ:
1917       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1918       break;
1919     default:
1920       ShouldNotReachHere();
1921     }
1922 #endif
1923   }
1924   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1925   return (UseAVX > 2) ? 6 : 4;
1926 }
1927 
1928 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1929                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is very complex, so the size is
  // obtained by emitting the instructions into a scratch buffer.
1932   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1933   if (cbuf) {
1934     MacroAssembler _masm(cbuf);
1935     int offset = __ offset();
1936     if (is_load) {
1937       switch (ireg) {
1938       case Op_VecS:
1939         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1940         break;
1941       case Op_VecD:
1942         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1943         break;
1944       case Op_VecX:
1945 #ifndef _LP64
1946         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1947 #else
1948         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1949           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1950         } else {
1951           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
1953         }
1954 #endif
1955         break;
1956       case Op_VecY:
1957 #ifndef _LP64
1958         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1959 #else
1960         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1961           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1962         } else {
1963           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
1965         }
1966 #endif
1967         break;
1968       case Op_VecZ:
1969         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1970         break;
1971       default:
1972         ShouldNotReachHere();
1973       }
1974     } else { // store
1975       switch (ireg) {
1976       case Op_VecS:
1977         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1978         break;
1979       case Op_VecD:
1980         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1981         break;
1982       case Op_VecX:
1983 #ifndef _LP64
1984         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1985 #else
1986         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1987           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1988         }
1989         else {
1990           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1991         }
1992 #endif
1993         break;
1994       case Op_VecY:
1995 #ifndef _LP64
1996         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1997 #else
1998         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1999           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2000         }
2001         else {
2002           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2003         }
2004 #endif
2005         break;
2006       case Op_VecZ:
2007         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2008         break;
2009       default:
2010         ShouldNotReachHere();
2011       }
2012     }
2013     int size = __ offset() - offset;
2014 #ifdef ASSERT
2015     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
    // The VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
2018 #endif
2019     return size;
2020 #ifndef PRODUCT
2021   } else if (!do_size) {
2022     if (is_load) {
2023       switch (ireg) {
2024       case Op_VecS:
2025         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2026         break;
2027       case Op_VecD:
2028         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2029         break;
      case Op_VecX:
2031         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2032         break;
2033       case Op_VecY:
2034       case Op_VecZ:
2035         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2036         break;
2037       default:
2038         ShouldNotReachHere();
2039       }
2040     } else { // store
2041       switch (ireg) {
2042       case Op_VecS:
2043         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2044         break;
2045       case Op_VecD:
2046         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2047         break;
      case Op_VecX:
2049         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2050         break;
2051       case Op_VecY:
2052       case Op_VecZ:
2053         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2054         break;
2055       default:
2056         ShouldNotReachHere();
2057       }
2058     }
2059 #endif
2060   }
2061   bool is_single_byte = false;
2062   int vec_len = 0;
2063   if ((UseAVX > 2) && (stack_offset != 0)) {
2064     int tuple_type = Assembler::EVEX_FVM;
2065     int input_size = Assembler::EVEX_32bit;
2066     switch (ireg) {
2067     case Op_VecS:
2068       tuple_type = Assembler::EVEX_T1S;
2069       break;
2070     case Op_VecD:
2071       tuple_type = Assembler::EVEX_T1S;
2072       input_size = Assembler::EVEX_64bit;
2073       break;
2074     case Op_VecX:
2075       break;
2076     case Op_VecY:
2077       vec_len = 1;
2078       break;
2079     case Op_VecZ:
2080       vec_len = 2;
2081       break;
2082     }
2083     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
2084   }
2085   int offset_size = 0;
2086   int size = 5;
  if (UseAVX > 2) {
2088     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
2089       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
2090       size += 2; // Need an additional two bytes for EVEX encoding
2091     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
2092       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2093     } else {
2094       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
2096     }
2097   } else {
2098     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
2099   }
  // The VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as the SIMD prefix.
  return size + offset_size;
2102 }
2103 
2104 static inline jint replicate4_imm(int con, int width) {
  // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
2106   assert(width == 1 || width == 2, "only byte or short types here");
2107   int bit_width = width * 8;
2108   jint val = con;
2109   val &= (1 << bit_width) - 1;  // mask off sign bits
  while (bit_width < 32) {
2111     val |= (val << bit_width);
2112     bit_width <<= 1;
2113   }
2114   return val;
2115 }
2116 
2117 static inline jlong replicate8_imm(int con, int width) {
  // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
2119   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2120   int bit_width = width * 8;
2121   jlong val = con;
2122   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
  while (bit_width < 64) {
2124     val |= (val << bit_width);
2125     bit_width <<= 1;
2126   }
2127   return val;
2128 }
2129 
2130 #ifndef PRODUCT
2131   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2132     st->print("nop \t# %d bytes pad for loops and calls", _count);
2133   }
2134 #endif
2135 
2136   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2137     MacroAssembler _masm(&cbuf);
2138     __ nop(_count);
2139   }
2140 
2141   uint MachNopNode::size(PhaseRegAlloc*) const {
2142     return _count;
2143   }
2144 
2145 #ifndef PRODUCT
2146   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2147     st->print("# breakpoint");
2148   }
2149 #endif
2150 
2151   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2152     MacroAssembler _masm(&cbuf);
2153     __ int3();
2154   }
2155 
2156   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2157     return MachNode::size(ra_);
2158   }
2159 
2160   
2161 
2162 %}
2163 
2164 encode %{
2165 
2166   enc_class call_epilog %{
2167     if (VerifyStackAtCalls) {
      // Check that stack depth is unchanged: find the magic cookie on the stack
2169       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2170       MacroAssembler _masm(&cbuf);
2171       Label L;
2172       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2173       __ jccb(Assembler::equal, L);
2174       // Die if stack mismatch
2175       __ int3();
2176       __ bind(L);
2177     }
2178   %}
2179 
2180 %}
2181 
2182 
2183 //----------OPERANDS-----------------------------------------------------------
2184 // Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
2186 // instruction definitions.
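
// The immUn operands below match an integer constant that fits in n unsigned
// bits (0 <= value < 2^n); they are typically used for small immediates such
// as shift counts and vector lane indices.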
2187 
2188 operand immU1() %{
2189   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(1));
2190   match(ConI);
2191 
2192   op_cost(0);
2193   format %{ %}
2194   interface(CONST_INTER);
2195 %}
2196 
2197 operand immU2() %{
2198   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(2));
2199   match(ConI);
2200 
2201   op_cost(0);
2202   format %{ %}
2203   interface(CONST_INTER);
2204 %}
2205 
2206 operand immU3() %{
2207   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(3));
2208   match(ConI);
2209 
2210   op_cost(0);
2211   format %{ %}
2212   interface(CONST_INTER);
2213 %}
2214 
2215 operand immU4() %{
2216   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(4));
2217   match(ConI);
2218 
2219   op_cost(0);
2220   format %{ %}
2221   interface(CONST_INTER);
2222 %}
2223 
2224 operand immU5() %{
2225   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(5));
2226   match(ConI);
2227 
2228   op_cost(0);
2229   format %{ %}
2230   interface(CONST_INTER);
2231 %}
2232 
2233 operand immU6() %{
2234   predicate(n->get_int() >= 0 && n->get_int() < nth_bit(6));
2235   match(ConI);
2236 
2237   op_cost(0);
2238   format %{ %}
2239   interface(CONST_INTER);
2240 %}
2241 
2242 // Comparison Code for FP conditional move
2243 operand cmpOp_vcmppd() %{
2244   match(Bool);
2245 
2246   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2247             n->as_Bool()->_test._test != BoolTest::no_overflow);
2248   format %{ "" %}
2249   interface(COND_INTER) %{
2250     equal        (0x0, "eq");
2251     less         (0x1, "lt");
2252     less_equal   (0x2, "le");
2253     not_equal    (0xC, "ne");
2254     greater_equal(0xD, "ge");
2255     greater      (0xE, "gt");
    //TODO: adlc cannot compile without the next two lines; it fails with this error:
2257     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2258     // equal' for overflow.
2259     overflow     (0x20, "o");  // not really supported by the instruction
2260     no_overflow  (0x21, "no"); // not really supported by the instruction
2261   %}
2262 %}
2263 
2264 
2265 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2266 
2267 // ============================================================================
2268 
2269 instruct ShouldNotReachHere() %{
2270   match(Halt);
2271   format %{ "ud2\t# ShouldNotReachHere" %}
2272   ins_encode %{
2273     __ ud2();
2274   %}
2275   ins_pipe(pipe_slow);
2276 %}
2277 
2278 // =================================EVEX special===============================
2279 
2280 instruct setMask(rRegI dst, rRegI src) %{
2281   predicate(Matcher::has_predicated_vectors());
2282   match(Set dst (SetVectMaskI  src));
2283   effect(TEMP dst);
2284   format %{ "setvectmask   $dst, $src" %}
2285   ins_encode %{
2286     __ setvectmask($dst$$Register, $src$$Register);
2287   %}
2288   ins_pipe(pipe_slow);
2289 %}
2290 
2291 // ============================================================================
2292 
2293 instruct addF_reg(regF dst, regF src) %{
2294   predicate((UseSSE>=1) && (UseAVX == 0));
2295   match(Set dst (AddF dst src));
2296 
2297   format %{ "addss   $dst, $src" %}
2298   ins_cost(150);
2299   ins_encode %{
2300     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2301   %}
2302   ins_pipe(pipe_slow);
2303 %}
2304 
2305 instruct addF_mem(regF dst, memory src) %{
2306   predicate((UseSSE>=1) && (UseAVX == 0));
2307   match(Set dst (AddF dst (LoadF src)));
2308 
2309   format %{ "addss   $dst, $src" %}
2310   ins_cost(150);
2311   ins_encode %{
2312     __ addss($dst$$XMMRegister, $src$$Address);
2313   %}
2314   ins_pipe(pipe_slow);
2315 %}
2316 
2317 instruct addF_imm(regF dst, immF con) %{
2318   predicate((UseSSE>=1) && (UseAVX == 0));
2319   match(Set dst (AddF dst con));
2320   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2321   ins_cost(150);
2322   ins_encode %{
2323     __ addss($dst$$XMMRegister, $constantaddress($con));
2324   %}
2325   ins_pipe(pipe_slow);
2326 %}
2327 
2328 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2329   predicate(UseAVX > 0);
2330   match(Set dst (AddF src1 src2));
2331 
2332   format %{ "vaddss  $dst, $src1, $src2" %}
2333   ins_cost(150);
2334   ins_encode %{
2335     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2336   %}
2337   ins_pipe(pipe_slow);
2338 %}
2339 
2340 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2341   predicate(UseAVX > 0);
2342   match(Set dst (AddF src1 (LoadF src2)));
2343 
2344   format %{ "vaddss  $dst, $src1, $src2" %}
2345   ins_cost(150);
2346   ins_encode %{
2347     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2348   %}
2349   ins_pipe(pipe_slow);
2350 %}
2351 
2352 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2353   predicate(UseAVX > 0);
2354   match(Set dst (AddF src con));
2355 
2356   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2357   ins_cost(150);
2358   ins_encode %{
2359     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2360   %}
2361   ins_pipe(pipe_slow);
2362 %}
2363 
2364 instruct addD_reg(regD dst, regD src) %{
2365   predicate((UseSSE>=2) && (UseAVX == 0));
2366   match(Set dst (AddD dst src));
2367 
2368   format %{ "addsd   $dst, $src" %}
2369   ins_cost(150);
2370   ins_encode %{
2371     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2372   %}
2373   ins_pipe(pipe_slow);
2374 %}
2375 
2376 instruct addD_mem(regD dst, memory src) %{
2377   predicate((UseSSE>=2) && (UseAVX == 0));
2378   match(Set dst (AddD dst (LoadD src)));
2379 
2380   format %{ "addsd   $dst, $src" %}
2381   ins_cost(150);
2382   ins_encode %{
2383     __ addsd($dst$$XMMRegister, $src$$Address);
2384   %}
2385   ins_pipe(pipe_slow);
2386 %}
2387 
2388 instruct addD_imm(regD dst, immD con) %{
2389   predicate((UseSSE>=2) && (UseAVX == 0));
2390   match(Set dst (AddD dst con));
2391   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2392   ins_cost(150);
2393   ins_encode %{
2394     __ addsd($dst$$XMMRegister, $constantaddress($con));
2395   %}
2396   ins_pipe(pipe_slow);
2397 %}
2398 
2399 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2400   predicate(UseAVX > 0);
2401   match(Set dst (AddD src1 src2));
2402 
2403   format %{ "vaddsd  $dst, $src1, $src2" %}
2404   ins_cost(150);
2405   ins_encode %{
2406     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2407   %}
2408   ins_pipe(pipe_slow);
2409 %}
2410 
2411 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2412   predicate(UseAVX > 0);
2413   match(Set dst (AddD src1 (LoadD src2)));
2414 
2415   format %{ "vaddsd  $dst, $src1, $src2" %}
2416   ins_cost(150);
2417   ins_encode %{
2418     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2419   %}
2420   ins_pipe(pipe_slow);
2421 %}
2422 
2423 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2424   predicate(UseAVX > 0);
2425   match(Set dst (AddD src con));
2426 
2427   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2428   ins_cost(150);
2429   ins_encode %{
2430     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2431   %}
2432   ins_pipe(pipe_slow);
2433 %}
2434 
2435 instruct subF_reg(regF dst, regF src) %{
2436   predicate((UseSSE>=1) && (UseAVX == 0));
2437   match(Set dst (SubF dst src));
2438 
2439   format %{ "subss   $dst, $src" %}
2440   ins_cost(150);
2441   ins_encode %{
2442     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2443   %}
2444   ins_pipe(pipe_slow);
2445 %}
2446 
2447 instruct subF_mem(regF dst, memory src) %{
2448   predicate((UseSSE>=1) && (UseAVX == 0));
2449   match(Set dst (SubF dst (LoadF src)));
2450 
2451   format %{ "subss   $dst, $src" %}
2452   ins_cost(150);
2453   ins_encode %{
2454     __ subss($dst$$XMMRegister, $src$$Address);
2455   %}
2456   ins_pipe(pipe_slow);
2457 %}
2458 
2459 instruct subF_imm(regF dst, immF con) %{
2460   predicate((UseSSE>=1) && (UseAVX == 0));
2461   match(Set dst (SubF dst con));
2462   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2463   ins_cost(150);
2464   ins_encode %{
2465     __ subss($dst$$XMMRegister, $constantaddress($con));
2466   %}
2467   ins_pipe(pipe_slow);
2468 %}
2469 
2470 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2471   predicate(UseAVX > 0);
2472   match(Set dst (SubF src1 src2));
2473 
2474   format %{ "vsubss  $dst, $src1, $src2" %}
2475   ins_cost(150);
2476   ins_encode %{
2477     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2478   %}
2479   ins_pipe(pipe_slow);
2480 %}
2481 
2482 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2483   predicate(UseAVX > 0);
2484   match(Set dst (SubF src1 (LoadF src2)));
2485 
2486   format %{ "vsubss  $dst, $src1, $src2" %}
2487   ins_cost(150);
2488   ins_encode %{
2489     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2490   %}
2491   ins_pipe(pipe_slow);
2492 %}
2493 
2494 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2495   predicate(UseAVX > 0);
2496   match(Set dst (SubF src con));
2497 
2498   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2499   ins_cost(150);
2500   ins_encode %{
2501     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2502   %}
2503   ins_pipe(pipe_slow);
2504 %}
2505 
2506 instruct subD_reg(regD dst, regD src) %{
2507   predicate((UseSSE>=2) && (UseAVX == 0));
2508   match(Set dst (SubD dst src));
2509 
2510   format %{ "subsd   $dst, $src" %}
2511   ins_cost(150);
2512   ins_encode %{
2513     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2514   %}
2515   ins_pipe(pipe_slow);
2516 %}
2517 
2518 instruct subD_mem(regD dst, memory src) %{
2519   predicate((UseSSE>=2) && (UseAVX == 0));
2520   match(Set dst (SubD dst (LoadD src)));
2521 
2522   format %{ "subsd   $dst, $src" %}
2523   ins_cost(150);
2524   ins_encode %{
2525     __ subsd($dst$$XMMRegister, $src$$Address);
2526   %}
2527   ins_pipe(pipe_slow);
2528 %}
2529 
2530 instruct subD_imm(regD dst, immD con) %{
2531   predicate((UseSSE>=2) && (UseAVX == 0));
2532   match(Set dst (SubD dst con));
2533   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2534   ins_cost(150);
2535   ins_encode %{
2536     __ subsd($dst$$XMMRegister, $constantaddress($con));
2537   %}
2538   ins_pipe(pipe_slow);
2539 %}
2540 
2541 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2542   predicate(UseAVX > 0);
2543   match(Set dst (SubD src1 src2));
2544 
2545   format %{ "vsubsd  $dst, $src1, $src2" %}
2546   ins_cost(150);
2547   ins_encode %{
2548     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2549   %}
2550   ins_pipe(pipe_slow);
2551 %}
2552 
2553 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2554   predicate(UseAVX > 0);
2555   match(Set dst (SubD src1 (LoadD src2)));
2556 
2557   format %{ "vsubsd  $dst, $src1, $src2" %}
2558   ins_cost(150);
2559   ins_encode %{
2560     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2561   %}
2562   ins_pipe(pipe_slow);
2563 %}
2564 
2565 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2566   predicate(UseAVX > 0);
2567   match(Set dst (SubD src con));
2568 
2569   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2570   ins_cost(150);
2571   ins_encode %{
2572     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2573   %}
2574   ins_pipe(pipe_slow);
2575 %}
2576 
2577 instruct mulF_reg(regF dst, regF src) %{
2578   predicate((UseSSE>=1) && (UseAVX == 0));
2579   match(Set dst (MulF dst src));
2580 
2581   format %{ "mulss   $dst, $src" %}
2582   ins_cost(150);
2583   ins_encode %{
2584     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2585   %}
2586   ins_pipe(pipe_slow);
2587 %}
2588 
2589 instruct mulF_mem(regF dst, memory src) %{
2590   predicate((UseSSE>=1) && (UseAVX == 0));
2591   match(Set dst (MulF dst (LoadF src)));
2592 
2593   format %{ "mulss   $dst, $src" %}
2594   ins_cost(150);
2595   ins_encode %{
2596     __ mulss($dst$$XMMRegister, $src$$Address);
2597   %}
2598   ins_pipe(pipe_slow);
2599 %}
2600 
2601 instruct mulF_imm(regF dst, immF con) %{
2602   predicate((UseSSE>=1) && (UseAVX == 0));
2603   match(Set dst (MulF dst con));
2604   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2605   ins_cost(150);
2606   ins_encode %{
2607     __ mulss($dst$$XMMRegister, $constantaddress($con));
2608   %}
2609   ins_pipe(pipe_slow);
2610 %}
2611 
2612 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2613   predicate(UseAVX > 0);
2614   match(Set dst (MulF src1 src2));
2615 
2616   format %{ "vmulss  $dst, $src1, $src2" %}
2617   ins_cost(150);
2618   ins_encode %{
2619     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2620   %}
2621   ins_pipe(pipe_slow);
2622 %}
2623 
2624 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2625   predicate(UseAVX > 0);
2626   match(Set dst (MulF src1 (LoadF src2)));
2627 
2628   format %{ "vmulss  $dst, $src1, $src2" %}
2629   ins_cost(150);
2630   ins_encode %{
2631     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2632   %}
2633   ins_pipe(pipe_slow);
2634 %}
2635 
2636 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2637   predicate(UseAVX > 0);
2638   match(Set dst (MulF src con));
2639 
2640   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2641   ins_cost(150);
2642   ins_encode %{
2643     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2644   %}
2645   ins_pipe(pipe_slow);
2646 %}
2647 
2648 instruct mulD_reg(regD dst, regD src) %{
2649   predicate((UseSSE>=2) && (UseAVX == 0));
2650   match(Set dst (MulD dst src));
2651 
2652   format %{ "mulsd   $dst, $src" %}
2653   ins_cost(150);
2654   ins_encode %{
2655     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2656   %}
2657   ins_pipe(pipe_slow);
2658 %}
2659 
2660 instruct mulD_mem(regD dst, memory src) %{
2661   predicate((UseSSE>=2) && (UseAVX == 0));
2662   match(Set dst (MulD dst (LoadD src)));
2663 
2664   format %{ "mulsd   $dst, $src" %}
2665   ins_cost(150);
2666   ins_encode %{
2667     __ mulsd($dst$$XMMRegister, $src$$Address);
2668   %}
2669   ins_pipe(pipe_slow);
2670 %}
2671 
2672 instruct mulD_imm(regD dst, immD con) %{
2673   predicate((UseSSE>=2) && (UseAVX == 0));
2674   match(Set dst (MulD dst con));
2675   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2676   ins_cost(150);
2677   ins_encode %{
2678     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2679   %}
2680   ins_pipe(pipe_slow);
2681 %}
2682 
2683 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2684   predicate(UseAVX > 0);
2685   match(Set dst (MulD src1 src2));
2686 
2687   format %{ "vmulsd  $dst, $src1, $src2" %}
2688   ins_cost(150);
2689   ins_encode %{
2690     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2691   %}
2692   ins_pipe(pipe_slow);
2693 %}
2694 
2695 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2696   predicate(UseAVX > 0);
2697   match(Set dst (MulD src1 (LoadD src2)));
2698 
2699   format %{ "vmulsd  $dst, $src1, $src2" %}
2700   ins_cost(150);
2701   ins_encode %{
2702     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2703   %}
2704   ins_pipe(pipe_slow);
2705 %}
2706 
2707 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2708   predicate(UseAVX > 0);
2709   match(Set dst (MulD src con));
2710 
2711   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2712   ins_cost(150);
2713   ins_encode %{
2714     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2715   %}
2716   ins_pipe(pipe_slow);
2717 %}
2718 
2719 instruct divF_reg(regF dst, regF src) %{
2720   predicate((UseSSE>=1) && (UseAVX == 0));
2721   match(Set dst (DivF dst src));
2722 
2723   format %{ "divss   $dst, $src" %}
2724   ins_cost(150);
2725   ins_encode %{
2726     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2727   %}
2728   ins_pipe(pipe_slow);
2729 %}
2730 
2731 instruct divF_mem(regF dst, memory src) %{
2732   predicate((UseSSE>=1) && (UseAVX == 0));
2733   match(Set dst (DivF dst (LoadF src)));
2734 
2735   format %{ "divss   $dst, $src" %}
2736   ins_cost(150);
2737   ins_encode %{
2738     __ divss($dst$$XMMRegister, $src$$Address);
2739   %}
2740   ins_pipe(pipe_slow);
2741 %}
2742 
2743 instruct divF_imm(regF dst, immF con) %{
2744   predicate((UseSSE>=1) && (UseAVX == 0));
2745   match(Set dst (DivF dst con));
2746   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2747   ins_cost(150);
2748   ins_encode %{
2749     __ divss($dst$$XMMRegister, $constantaddress($con));
2750   %}
2751   ins_pipe(pipe_slow);
2752 %}
2753 
2754 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2755   predicate(UseAVX > 0);
2756   match(Set dst (DivF src1 src2));
2757 
2758   format %{ "vdivss  $dst, $src1, $src2" %}
2759   ins_cost(150);
2760   ins_encode %{
2761     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2762   %}
2763   ins_pipe(pipe_slow);
2764 %}
2765 
2766 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2767   predicate(UseAVX > 0);
2768   match(Set dst (DivF src1 (LoadF src2)));
2769 
2770   format %{ "vdivss  $dst, $src1, $src2" %}
2771   ins_cost(150);
2772   ins_encode %{
2773     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2774   %}
2775   ins_pipe(pipe_slow);
2776 %}
2777 
2778 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2779   predicate(UseAVX > 0);
2780   match(Set dst (DivF src con));
2781 
2782   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2783   ins_cost(150);
2784   ins_encode %{
2785     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2786   %}
2787   ins_pipe(pipe_slow);
2788 %}
2789 
2790 instruct divD_reg(regD dst, regD src) %{
2791   predicate((UseSSE>=2) && (UseAVX == 0));
2792   match(Set dst (DivD dst src));
2793 
2794   format %{ "divsd   $dst, $src" %}
2795   ins_cost(150);
2796   ins_encode %{
2797     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2798   %}
2799   ins_pipe(pipe_slow);
2800 %}
2801 
2802 instruct divD_mem(regD dst, memory src) %{
2803   predicate((UseSSE>=2) && (UseAVX == 0));
2804   match(Set dst (DivD dst (LoadD src)));
2805 
2806   format %{ "divsd   $dst, $src" %}
2807   ins_cost(150);
2808   ins_encode %{
2809     __ divsd($dst$$XMMRegister, $src$$Address);
2810   %}
2811   ins_pipe(pipe_slow);
2812 %}
2813 
2814 instruct divD_imm(regD dst, immD con) %{
2815   predicate((UseSSE>=2) && (UseAVX == 0));
2816   match(Set dst (DivD dst con));
2817   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2818   ins_cost(150);
2819   ins_encode %{
2820     __ divsd($dst$$XMMRegister, $constantaddress($con));
2821   %}
2822   ins_pipe(pipe_slow);
2823 %}
2824 
2825 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2826   predicate(UseAVX > 0);
2827   match(Set dst (DivD src1 src2));
2828 
2829   format %{ "vdivsd  $dst, $src1, $src2" %}
2830   ins_cost(150);
2831   ins_encode %{
2832     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2833   %}
2834   ins_pipe(pipe_slow);
2835 %}
2836 
2837 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2838   predicate(UseAVX > 0);
2839   match(Set dst (DivD src1 (LoadD src2)));
2840 
2841   format %{ "vdivsd  $dst, $src1, $src2" %}
2842   ins_cost(150);
2843   ins_encode %{
2844     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2845   %}
2846   ins_pipe(pipe_slow);
2847 %}
2848 
2849 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2850   predicate(UseAVX > 0);
2851   match(Set dst (DivD src con));
2852 
2853   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2854   ins_cost(150);
2855   ins_encode %{
2856     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2857   %}
2858   ins_pipe(pipe_slow);
2859 %}
2860 
2861 instruct absF_reg(regF dst) %{
2862   predicate((UseSSE>=1) && (UseAVX == 0));
2863   match(Set dst (AbsF dst));
2864   ins_cost(150);
2865   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2866   ins_encode %{
2867     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2868   %}
2869   ins_pipe(pipe_slow);
2870 %}
2871 
2872 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2873   predicate(UseAVX > 0);
2874   match(Set dst (AbsF src));
2875   ins_cost(150);
2876   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2877   ins_encode %{
2878     int vector_len = 0;
2879     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2880               ExternalAddress(float_signmask()), vector_len);
2881   %}
2882   ins_pipe(pipe_slow);
2883 %}
2884 
2885 instruct absD_reg(regD dst) %{
2886   predicate((UseSSE>=2) && (UseAVX == 0));
2887   match(Set dst (AbsD dst));
2888   ins_cost(150);
2889   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2890             "# abs double by sign masking" %}
2891   ins_encode %{
2892     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2893   %}
2894   ins_pipe(pipe_slow);
2895 %}
2896 
2897 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2898   predicate(UseAVX > 0);
2899   match(Set dst (AbsD src));
2900   ins_cost(150);
2901   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2902             "# abs double by sign masking" %}
2903   ins_encode %{
2904     int vector_len = 0;
2905     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2906               ExternalAddress(double_signmask()), vector_len);
2907   %}
2908   ins_pipe(pipe_slow);
2909 %}
2910 
instruct negI_rReg_2(rRegI dst, rFlagsReg cr) %{
2913   match(Set dst (NegI dst));
2914   effect(KILL cr);
2915 
2916   format %{ "negl    $dst\t# int" %}
2917   ins_encode %{
2918     __ negl($dst$$Register);
2919   %}
2920   ins_pipe(ialu_reg);
2921 %}
2922 
instruct negL_rReg_2(rRegL dst, rFlagsReg cr) %{
2925   match(Set dst (NegL dst));
2926   effect(KILL cr);
2927 
2928   format %{ "negq    $dst\t# int" %}
2929   ins_encode %{
2930     __ negq($dst$$Register);
2931   %}
2932   ins_pipe(ialu_reg);
2933 %}
2934 
2935 instruct negF_reg(regF dst) %{
2936   predicate((UseSSE>=1) && (UseAVX == 0));
2937   match(Set dst (NegF dst));
2938   ins_cost(150);
2939   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2940   ins_encode %{
2941     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2942   %}
2943   ins_pipe(pipe_slow);
2944 %}
2945 
2946 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2947   predicate(UseAVX > 0);
2948   match(Set dst (NegF src));
2949   ins_cost(150);
2950   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2951   ins_encode %{
2952     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2953                  ExternalAddress(float_signflip()));
2954   %}
2955   ins_pipe(pipe_slow);
2956 %}
2957 
2958 instruct negD_reg(regD dst) %{
2959   predicate((UseSSE>=2) && (UseAVX == 0));
2960   match(Set dst (NegD dst));
2961   ins_cost(150);
2962   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2963             "# neg double by sign flipping" %}
2964   ins_encode %{
2965     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2966   %}
2967   ins_pipe(pipe_slow);
2968 %}
2969 
2970 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2971   predicate(UseAVX > 0);
2972   match(Set dst (NegD src));
2973   ins_cost(150);
2974   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2975             "# neg double by sign flipping" %}
2976   ins_encode %{
2977     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2978                  ExternalAddress(double_signflip()));
2979   %}
2980   ins_pipe(pipe_slow);
2981 %}
2982 
2983 instruct sqrtF_reg(regF dst, regF src) %{
2984   predicate(UseSSE>=1);
2985   match(Set dst (SqrtF src));
2986 
2987   format %{ "sqrtss  $dst, $src" %}
2988   ins_cost(150);
2989   ins_encode %{
2990     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2991   %}
2992   ins_pipe(pipe_slow);
2993 %}
2994 
2995 instruct sqrtF_mem(regF dst, memory src) %{
2996   predicate(UseSSE>=1);
2997   match(Set dst (SqrtF (LoadF src)));
2998 
2999   format %{ "sqrtss  $dst, $src" %}
3000   ins_cost(150);
3001   ins_encode %{
3002     __ sqrtss($dst$$XMMRegister, $src$$Address);
3003   %}
3004   ins_pipe(pipe_slow);
3005 %}
3006 
3007 instruct sqrtF_imm(regF dst, immF con) %{
3008   predicate(UseSSE>=1);
3009   match(Set dst (SqrtF con));
3010 
3011   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3012   ins_cost(150);
3013   ins_encode %{
3014     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
3015   %}
3016   ins_pipe(pipe_slow);
3017 %}
3018 
3019 instruct sqrtD_reg(regD dst, regD src) %{
3020   predicate(UseSSE>=2);
3021   match(Set dst (SqrtD src));
3022 
3023   format %{ "sqrtsd  $dst, $src" %}
3024   ins_cost(150);
3025   ins_encode %{
3026     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
3027   %}
3028   ins_pipe(pipe_slow);
3029 %}
3030 
3031 instruct sqrtD_mem(regD dst, memory src) %{
3032   predicate(UseSSE>=2);
3033   match(Set dst (SqrtD (LoadD src)));
3034 
3035   format %{ "sqrtsd  $dst, $src" %}
3036   ins_cost(150);
3037   ins_encode %{
3038     __ sqrtsd($dst$$XMMRegister, $src$$Address);
3039   %}
3040   ins_pipe(pipe_slow);
3041 %}
3042 
3043 instruct sqrtD_imm(regD dst, immD con) %{
3044   predicate(UseSSE>=2);
3045   match(Set dst (SqrtD con));
3046   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3047   ins_cost(150);
3048   ins_encode %{
3049     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
3050   %}
3051   ins_pipe(pipe_slow);
3052 %}
3053 
3054 instruct onspinwait() %{
3055   match(OnSpinWait);
3056   ins_cost(200);
3057 
3058   format %{
3059     $$template
3060     $$emit$$"pause\t! membar_onspinwait"
3061   %}
3062   ins_encode %{
3063     __ pause();
3064   %}
3065   ins_pipe(pipe_slow);
3066 %}
3067 
3068 // a * b + c
3069 instruct fmaD_reg(regD a, regD b, regD c) %{
3070   predicate(UseFMA);
3071   match(Set c (FmaD  c (Binary a b)));
3072   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3073   ins_cost(150);
3074   ins_encode %{
3075     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3076   %}
3077   ins_pipe( pipe_slow );
3078 %}
3079 
3080 // a * b + c
3081 instruct fmaF_reg(regF a, regF b, regF c) %{
3082   predicate(UseFMA);
3083   match(Set c (FmaF  c (Binary a b)));
3084   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3085   ins_cost(150);
3086   ins_encode %{
3087     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3088   %}
3089   ins_pipe( pipe_slow );
3090 %}
3091 
3092 // ====================VECTOR INSTRUCTIONS=====================================
3093 
3094 instruct reinterpretS(vecS dst) %{
3095   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3096   match(Set dst (VectorReinterpret dst));
3097   ins_cost(125);
3098   format %{ " # reinterpret $dst" %}
3099   ins_encode %{
3100     // empty
3101   %}
3102   ins_pipe( pipe_slow );
3103 %}
3104 
3105 instruct reinterpretS2D(vecD dst, vecS src, rRegL scratch) %{
3106   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3107   match(Set dst (VectorReinterpret src));
3108   ins_cost(125);
3109   effect(TEMP dst, TEMP scratch);
3110   format %{ " # reinterpret $dst,$src" %}
3111   ins_encode %{
3112     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3113     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3114   %}
3115   ins_pipe( pipe_slow );
3116 %}
3117 
3118 instruct reinterpretS2D_avx(vecD dst, vecS src, rRegL scratch) %{
3119   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3120   match(Set dst (VectorReinterpret src));
3121   ins_cost(125);
3122   effect(TEMP dst, TEMP scratch);
3123   format %{ " # reinterpret $dst,$src" %}
3124   ins_encode %{
3125     int vector_len = 0;
3126     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3127   %}
3128   ins_pipe( pipe_slow );
3129 %}
3130 
3131 instruct reinterpretS2X(vecX dst, vecS src, rRegL scratch) %{
3132   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3133   match(Set dst (VectorReinterpret src));
3134   ins_cost(125);
3135   effect(TEMP dst, TEMP scratch);
3136   format %{ " # reinterpret $dst,$src" %}
3137   ins_encode %{
3138     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3139     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3140   %}
3141   ins_pipe( pipe_slow );
3142 %}
3143 
3144 instruct reinterpretS2X_avx(vecX dst, vecS src, rRegL scratch) %{
3145   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3146   match(Set dst (VectorReinterpret src));
3147   ins_cost(125);
3148   effect(TEMP scratch);
3149   format %{ " # reinterpret $dst,$src" %}
3150   ins_encode %{
3151     int vector_len = 0;
3152     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3153   %}
3154   ins_pipe( pipe_slow );
3155 %}
3156 
3157 instruct reinterpretS2Y(vecY dst, vecS src, rRegL scratch) %{
3158   predicate(UseAVX >= 2 && n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3159   match(Set dst (VectorReinterpret src));
3160   ins_cost(125);
3161   effect(TEMP scratch);
3162   format %{ " # reinterpret $dst,$src" %}
3163   ins_encode %{
3164     int vector_len = 1;
3165     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3166   %}
3167   ins_pipe( pipe_slow );
3168 %}
3169 
3170 instruct reinterpretS2Z(vecZ dst, vecS src, rRegL scratch) %{
3171   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 4);
3172   match(Set dst (VectorReinterpret src));
3173   ins_cost(125);
3174   effect(TEMP scratch);
3175   format %{ " # reinterpret $dst,$src" %}
3176   ins_encode %{
3177     int vector_len = 2;
3178     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), vector_len, $scratch$$Register);
3179   %}
3180   ins_pipe( pipe_slow );
3181 %}
3182 
3183 instruct reinterpretD2S(vecS dst, vecD src) %{
3184   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3185   match(Set dst (VectorReinterpret src));
3186   ins_cost(125);
3187   format %{ " # reinterpret $dst,$src" %}
3188   ins_encode %{
    // If the registers are the same, no move is needed.
3190     if ($dst$$XMMRegister != $src$$XMMRegister) {
3191       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3192     }
3193   %}
3194   ins_pipe( pipe_slow );
3195 %}
3196 
3197 instruct reinterpretD(vecD dst) %{
3198   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3199   match(Set dst (VectorReinterpret dst));
3200   ins_cost(125);
3201   format %{ " # reinterpret $dst" %}
3202   ins_encode %{
3203     // empty
3204   %}
3205   ins_pipe( pipe_slow );
3206 %}
3207 
3208 instruct reinterpretD2X(vecX dst, vecD src, rRegL scratch) %{
3209   predicate(UseAVX == 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3210   match(Set dst (VectorReinterpret src));
3211   ins_cost(125);
3212   effect(TEMP dst, TEMP scratch);
3213   format %{ " # reinterpret $dst,$src" %}
3214   ins_encode %{
3215     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3216     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3217   %}
3218   ins_pipe( pipe_slow );
3219 %}
3220 
3221 instruct reinterpretD2X_avx(vecX dst, vecD src, rRegL scratch) %{
3222   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3223   match(Set dst (VectorReinterpret src));
3224   ins_cost(125);
3225   effect(TEMP dst, TEMP scratch);
3226   format %{ " # reinterpret $dst,$src" %}
3227   ins_encode %{
3228     int vector_len = 0;
3229     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3230   %}
3231   ins_pipe( pipe_slow );
3232 %}
3233 
3234 instruct reinterpretD2Y(vecY dst, vecD src, rRegL scratch) %{
3235   predicate(UseAVX >= 2 && n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3236   match(Set dst (VectorReinterpret src));
3237   ins_cost(125);
3238   effect(TEMP scratch);
3239   format %{ " # reinterpret $dst,$src" %}
3240   ins_encode %{
3241     int vector_len = 1;
3242     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3243   %}
3244   ins_pipe( pipe_slow );
3245 %}
3246 
3247 instruct reinterpretD2Z(vecZ dst, vecD src, rRegL scratch) %{
3248   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 8);
3249   match(Set dst (VectorReinterpret src));
3250   ins_cost(125);
3251   effect(TEMP scratch);
3252   format %{ " # reinterpret $dst,$src" %}
3253   ins_encode %{
3254     int vector_len = 2;
3255     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_64_bit_mask()), vector_len, $scratch$$Register);
3256   %}
3257   ins_pipe( pipe_slow );
3258 %}
3259 
3260 instruct reinterpretX2S(vecS dst, vecX src) %{
3261   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3262   match(Set dst (VectorReinterpret src));
3263   ins_cost(125);
3264   format %{ " # reinterpret $dst,$src" %}
3265   ins_encode %{
    // If the registers are the same, no move is needed.
3267     if ($dst$$XMMRegister != $src$$XMMRegister) {
3268       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3269     }
3270   %}
3271   ins_pipe( pipe_slow );
3272 %}
3273 
3274 instruct reinterpretX2D(vecD dst, vecX src) %{
3275   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3276   match(Set dst (VectorReinterpret src));
3277   ins_cost(125);
3278   format %{ " # reinterpret $dst,$src" %}
3279   ins_encode %{
    // If the registers are the same, no move is needed.
3281     if ($dst$$XMMRegister != $src$$XMMRegister) {
3282       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3283     }
3284   %}
3285   ins_pipe( pipe_slow );
3286 %}
3287 
3288 instruct reinterpretX(vecX dst) %{
3289   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3290   match(Set dst (VectorReinterpret dst));
3291   ins_cost(125);
3292   format %{ " # reinterpret $dst" %}
3293   ins_encode %{
3294     // empty
3295   %}
3296   ins_pipe( pipe_slow );
3297 %}
3298 
3299 instruct reinterpretX2Y(vecY dst, vecX src) %{
3300   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3301   match(Set dst (VectorReinterpret src));
3302   ins_cost(125);
3303   effect(TEMP dst);
3304   format %{ " # reinterpret $dst,$src" %}
3305   ins_encode %{
3306     int vector_len = 1;
3307     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);  // only the low 128 bits need to be moved
3309   %}
3310   ins_pipe( pipe_slow );
3311 %}
3312 
3313 instruct reinterpretX2Z(vecZ dst, vecX src) %{
3314   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 16);
3315   match(Set dst (VectorReinterpret src));
3316   ins_cost(125);
3317   effect(TEMP dst);
3318   format %{ " # reinterpret $dst,$src" %}
3319   ins_encode %{
3320     int vector_len = 2;
3321     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);  // only the low 128 bits need to be moved
3323   %}
3324   ins_pipe( pipe_slow );
3325 %}
3326 
3327 instruct reinterpretY2S(vecS dst, vecY src) %{
3328   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3329   match(Set dst (VectorReinterpret src));
3330   ins_cost(125);
3331   format %{ " # reinterpret $dst,$src" %}
3332   ins_encode %{
    // If the registers are the same, no move is needed.
3334     if ($dst$$XMMRegister != $src$$XMMRegister) {
3335       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3336     }
3337   %}
3338   ins_pipe( pipe_slow );
3339 %}
3340 
3341 instruct reinterpretY2D(vecD dst, vecY src) %{
3342   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3343   match(Set dst (VectorReinterpret src));
3344   ins_cost(125);
3345   format %{ " # reinterpret $dst,$src" %}
3346   ins_encode %{
    // If the registers are the same, no move is needed.
3348     if ($dst$$XMMRegister != $src$$XMMRegister) {
3349       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3350     }
3351   %}
3352   ins_pipe( pipe_slow );
3353 %}
3354 
3355 instruct reinterpretY2X(vecX dst, vecY src) %{
3356   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3357   match(Set dst (VectorReinterpret src));
3358   ins_cost(125);
3359   format %{ " # reinterpret $dst,$src" %}
3360   ins_encode %{
    // If the registers are the same, no move is needed.
3362     if ($dst$$XMMRegister != $src$$XMMRegister) {
3363       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3364     }
3365   %}
3366   ins_pipe( pipe_slow );
3367 %}
3368 
3369 instruct reinterpretY(vecY dst) %{
3370   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3371   match(Set dst (VectorReinterpret dst));
3372   ins_cost(125);
3373   format %{ " # reinterpret $dst" %}
3374   ins_encode %{
3375     // empty
3376   %}
3377   ins_pipe( pipe_slow );
3378 %}
3379 
3380 instruct reinterpretY2Z(vecZ dst, vecY src) %{
3381   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 32);
3382   match(Set dst (VectorReinterpret src));
3383   ins_cost(125);
3384   effect(TEMP dst);
3385   format %{ " # reinterpret $dst,$src" %}
3386   ins_encode %{
3387     int vector_len = 2;
3388     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3389     __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3390   %}
3391   ins_pipe( pipe_slow );
3392 %}
3393 
3394 instruct reinterpretZ2S(vecS dst, vecZ src) %{
3395   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 4 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3396   match(Set dst (VectorReinterpret src));
3397   ins_cost(125);
3398   format %{ " # reinterpret $dst,$src" %}
3399   ins_encode %{
    // If the registers are the same, no move is needed.
3401     if ($dst$$XMMRegister != $src$$XMMRegister) {
3402       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3403     }
3404   %}
3405   ins_pipe( pipe_slow );
3406 %}
3407 
3408 instruct reinterpretZ2D(vecD dst, vecZ src) %{
3409   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3410   match(Set dst (VectorReinterpret src));
3411   ins_cost(125);
3412   format %{ " # reinterpret $dst,$src" %}
3413   ins_encode %{
    // If the registers are the same, no move is needed.
3415     if ($dst$$XMMRegister != $src$$XMMRegister) {
3416       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3417     }
3418   %}
3419   ins_pipe( pipe_slow );
3420 %}
3421 
3422 instruct reinterpretZ2X(vecX dst, vecZ src) %{
3423   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3424   match(Set dst (VectorReinterpret src));
3425   ins_cost(125);
3426   format %{ " # reinterpret $dst,$src" %}
3427   ins_encode %{
    // If the registers are the same, no move is needed.
3429     if ($dst$$XMMRegister != $src$$XMMRegister) {
3430       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3431     }
3432   %}
3433   ins_pipe( pipe_slow );
3434 %}
3435 
3436 instruct reinterpretZ2Y(vecY dst, vecZ src) %{
3437   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 32 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3438   match(Set dst (VectorReinterpret src));
3439   ins_cost(125);
3440   format %{ " # reinterpret $dst,$src" %}
3441   ins_encode %{
    // If the registers are the same, no move is needed.
3443     if ($dst$$XMMRegister != $src$$XMMRegister) {
3444       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3445     }
3446   %}
3447   ins_pipe( pipe_slow );
3448 %}
3449 
3450 instruct reinterpretZ(vecZ dst) %{
3451   predicate(n->bottom_type()->is_vect()->length_in_bytes() == 64 && n->in(1)->bottom_type()->is_vect()->length_in_bytes() == 64);
3452   match(Set dst (VectorReinterpret dst));
3453   ins_cost(125);
3454   format %{ " # reinterpret $dst" %}
3455   ins_encode %{
3456     // empty
3457   %}
3458   ins_pipe( pipe_slow );
3459 %}
3460 
3461 // ==========
3462 
3463 // Load vectors (1 byte long)
3464 instruct loadV1(vecS dst, memory mem, rRegI tmp) %{
3465   predicate(n->as_LoadVector()->memory_size() == 1);
3466   match(Set dst (LoadVector mem));
3467   ins_cost(125);
3468   effect(TEMP tmp);
3469   format %{ "movzbl $tmp,$mem\n\t"
3470           "movd $dst,$tmp\t! load vector (1 byte)" %}
3471   ins_encode %{
3472     __ movzbl($tmp$$Register, $mem$$Address);
3473     __ movdl($dst$$XMMRegister, $tmp$$Register);
3474   %}
3475   ins_pipe( pipe_slow );
3476 %}
3477 
3478 // Load vectors (2 bytes long)
3479 instruct loadV2(vecS dst, memory mem, rRegI tmp) %{
3480   predicate(n->as_LoadVector()->memory_size() == 2);
3481   match(Set dst (LoadVector mem));
3482   ins_cost(125);
3483   effect(TEMP tmp);
3484   format %{ "movzwl $tmp,$mem\n\t"
3485           "movd $dst,$tmp\t! load vector (2 bytes)" %}
3486   ins_encode %{
3487     __ movzwl($tmp$$Register, $mem$$Address);
3488     __ movdl($dst$$XMMRegister, $tmp$$Register);
3489   %}
3490   ins_pipe( pipe_slow );
3491 %}
3492 
3493 
3494 // Load vectors (4 bytes long)
3495 instruct loadV4(vecS dst, memory mem) %{
3496   predicate(n->as_LoadVector()->memory_size() == 4);
3497   match(Set dst (LoadVector mem));
3498   ins_cost(125);
3499   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
3500   ins_encode %{
3501     __ movdl($dst$$XMMRegister, $mem$$Address);
3502   %}
3503   ins_pipe( pipe_slow );
3504 %}
3505 
// Move vectors (4 bytes long)
instruct MoveVecS2Leg(legVecS dst, vecS src) %{
  match(Set dst src);
  format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
3510   ins_encode %{
3511     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
3512   %}
3513   ins_pipe( fpu_reg_reg );
3514 %}
3515 
// Move vectors (4 bytes long)
instruct MoveLeg2VecS(vecS dst, legVecS src) %{
  match(Set dst src);
  format %{ "movss $dst,$src\t! move vector (4 bytes)" %}
3520   ins_encode %{
3521     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
3522   %}
3523   ins_pipe( fpu_reg_reg );
3524 %}
3525 
3526 // Load vectors (8 bytes long)
3527 instruct loadV8(vecD dst, memory mem) %{
3528   predicate(n->as_LoadVector()->memory_size() == 8);
3529   match(Set dst (LoadVector mem));
3530   ins_cost(125);
3531   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
3532   ins_encode %{
3533     __ movq($dst$$XMMRegister, $mem$$Address);
3534   %}
3535   ins_pipe( pipe_slow );
3536 %}
3537 
// Move vectors (8 bytes long)
instruct MoveVecD2Leg(legVecD dst, vecD src) %{
  match(Set dst src);
  format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3542   ins_encode %{
3543     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3544   %}
3545   ins_pipe( fpu_reg_reg );
3546 %}
3547 
// Move vectors (8 bytes long)
instruct MoveLeg2VecD(vecD dst, legVecD src) %{
  match(Set dst src);
  format %{ "movsd $dst,$src\t! move vector (8 bytes)" %}
3552   ins_encode %{
3553     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
3554   %}
3555   ins_pipe( fpu_reg_reg );
3556 %}
3557 
3558 // Load vectors (16 bytes long)
3559 instruct loadV16(vecX dst, memory mem) %{
3560   predicate(n->as_LoadVector()->memory_size() == 16);
3561   match(Set dst (LoadVector mem));
3562   ins_cost(125);
3563   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
3564   ins_encode %{
3565     __ movdqu($dst$$XMMRegister, $mem$$Address);
3566   %}
3567   ins_pipe( pipe_slow );
3568 %}
3569 
// Move vectors (16 bytes long)
instruct MoveVecX2Leg(legVecX dst, vecX src) %{
  match(Set dst src);
  format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3574   ins_encode %{
3575     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3576       int vector_len = 2;
3577       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3578     } else {
3579       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3580     }
3581   %}
3582   ins_pipe( fpu_reg_reg );
3583 %}
3584 
// Move vectors (16 bytes long)
instruct MoveLeg2VecX(vecX dst, legVecX src) %{
  match(Set dst src);
  format %{ "movdqu $dst,$src\t! move vector (16 bytes)" %}
3589   ins_encode %{
3590     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3591       int vector_len = 2;
3592       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3593     } else {
3594       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
3595     }
3596   %}
3597   ins_pipe( fpu_reg_reg );
3598 %}
3599 
3600 // Load vectors (32 bytes long)
3601 instruct loadV32(vecY dst, memory mem) %{
3602   predicate(n->as_LoadVector()->memory_size() == 32);
3603   match(Set dst (LoadVector mem));
3604   ins_cost(125);
3605   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3606   ins_encode %{
3607     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3608   %}
3609   ins_pipe( pipe_slow );
3610 %}
3611 
// Move vectors (32 bytes long)
instruct MoveVecY2Leg(legVecY dst, vecY src) %{
  match(Set dst src);
  format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3616   ins_encode %{
3617     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3618       int vector_len = 2;
3619       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3620     } else {
3621       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3622     }
3623   %}
3624   ins_pipe( fpu_reg_reg );
3625 %}
3626 
// Move vectors (32 bytes long)
instruct MoveLeg2VecY(vecY dst, legVecY src) %{
  match(Set dst src);
  format %{ "vmovdqu $dst,$src\t! move vector (32 bytes)" %}
3631   ins_encode %{
3632     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3633       int vector_len = 2;
3634       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3635     } else {
3636       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3637     }
3638   %}
3639   ins_pipe( fpu_reg_reg );
3640 %}
3641 
3642 // Load vectors (64 bytes long)
3643 instruct loadV64_dword(vecZ dst, memory mem) %{
3644   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3645   match(Set dst (LoadVector mem));
3646   ins_cost(125);
3647   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3648   ins_encode %{
3649     int vector_len = 2;
3650     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3651   %}
3652   ins_pipe( pipe_slow );
3653 %}
3654 
3655 // Load vectors (64 bytes long)
3656 instruct loadV64_qword(vecZ dst, memory mem) %{
3657   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3658   match(Set dst (LoadVector mem));
3659   ins_cost(125);
3660   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3661   ins_encode %{
3662     int vector_len = 2;
3663     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3664   %}
3665   ins_pipe( pipe_slow );
3666 %}
3667 
instruct MoveVecZ2Leg(legVecZ dst, vecZ src) %{
3669   match(Set dst src);
3670   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3671   ins_encode %{
3672     int vector_len = 2;
3673     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3674   %}
3675   ins_pipe( fpu_reg_reg );
3676 %}
3677 
instruct MoveLeg2VecZ(vecZ dst, legVecZ src) %{
3679   match(Set dst src);
3680   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3681   ins_encode %{
3682     int vector_len = 2;
3683     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3684   %}
3685   ins_pipe( fpu_reg_reg );
3686 %}
3687 
3688 // Store vectors
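// Stores narrower than 4 bytes have no direct XMM form here, so they bounce
// through a temporary general-purpose register (movd to the GPR, then a
// plain movb/movw to memory).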
3689 instruct storeV1(memory mem, vecS src, rRegI tmp) %{
3690   predicate(n->as_StoreVector()->memory_size() == 1);
3691   match(Set mem (StoreVector mem src));
3692   ins_cost(145);
3693   effect(TEMP tmp);
3694   format %{ "movd $tmp,$src\n\t"
3695           "movb $mem,$tmp\t! store vector (1 byte)" %}
3696   ins_encode %{
3697     __ movdl($tmp$$Register, $src$$XMMRegister);
3698     __ movb($mem$$Address, $tmp$$Register);
3699   %}
3700   ins_pipe( pipe_slow );
3701 %}
3702 
3703 instruct storeV2(memory mem, vecS src, rRegI tmp) %{
3704   predicate(n->as_StoreVector()->memory_size() == 2);
3705   match(Set mem (StoreVector mem src));
3706   ins_cost(145);
3707   effect(TEMP tmp);
3708   format %{ "movd $tmp,$src\n\t"
3709           "movw $mem,$tmp\t! store vector (2 bytes)" %}
3710   ins_encode %{
3711     __ movdl($tmp$$Register, $src$$XMMRegister);
3712     __ movw($mem$$Address, $tmp$$Register);
3713   %}
3714   ins_pipe( pipe_slow );
3715 %}
3716 
3717 instruct storeV4(memory mem, vecS src) %{
3718   predicate(n->as_StoreVector()->memory_size() == 4);
3719   match(Set mem (StoreVector mem src));
3720   ins_cost(145);
3721   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3722   ins_encode %{
3723     __ movdl($mem$$Address, $src$$XMMRegister);
3724   %}
3725   ins_pipe( pipe_slow );
3726 %}
3727 
3728 instruct storeV8(memory mem, vecD src) %{
3729   predicate(n->as_StoreVector()->memory_size() == 8);
3730   match(Set mem (StoreVector mem src));
3731   ins_cost(145);
3732   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3733   ins_encode %{
3734     __ movq($mem$$Address, $src$$XMMRegister);
3735   %}
3736   ins_pipe( pipe_slow );
3737 %}
3738 
3739 instruct storeV16(memory mem, vecX src) %{
3740   predicate(n->as_StoreVector()->memory_size() == 16);
3741   match(Set mem (StoreVector mem src));
3742   ins_cost(145);
3743   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3744   ins_encode %{
3745     __ movdqu($mem$$Address, $src$$XMMRegister);
3746   %}
3747   ins_pipe( pipe_slow );
3748 %}
3749 
3750 instruct storeV32(memory mem, vecY src) %{
3751   predicate(n->as_StoreVector()->memory_size() == 32);
3752   match(Set mem (StoreVector mem src));
3753   ins_cost(145);
3754   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3755   ins_encode %{
3756     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3757   %}
3758   ins_pipe( pipe_slow );
3759 %}
3760 
3761 instruct storeV64_dword(memory mem, vecZ src) %{
3762   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3763   match(Set mem (StoreVector mem src));
3764   ins_cost(145);
3765   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3766   ins_encode %{
3767     int vector_len = 2;
3768     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3769   %}
3770   ins_pipe( pipe_slow );
3771 %}
3772 
3773 instruct storeV64_qword(memory mem, vecZ src) %{
3774   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3775   match(Set mem (StoreVector mem src));
3776   ins_cost(145);
3777   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3778   ins_encode %{
3779     int vector_len = 2;
3780     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3781   %}
3782   ins_pipe( pipe_slow );
3783 %}
3784 
3785 // ====================LEGACY REPLICATE=======================================
3786 
3787 instruct Repl4B_mem(vecS dst, memory mem) %{
3788   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3789   match(Set dst (ReplicateB (LoadB mem)));
3790   format %{ "punpcklbw $dst,$mem\n\t"
3791             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3792   ins_encode %{
3793     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3794     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3795   %}
3796   ins_pipe( pipe_slow );
3797 %}
3798 
3799 instruct Repl8B_mem(vecD dst, memory mem) %{
3800   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3801   match(Set dst (ReplicateB (LoadB mem)));
3802   format %{ "punpcklbw $dst,$mem\n\t"
3803             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3804   ins_encode %{
3805     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3806     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3807   %}
3808   ins_pipe( pipe_slow );
3809 %}
3810 
3811 instruct Repl16B(vecX dst, rRegI src) %{
3812   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3813   match(Set dst (ReplicateB src));
3814   format %{ "movd    $dst,$src\n\t"
3815             "punpcklbw $dst,$dst\n\t"
3816             "pshuflw $dst,$dst,0x00\n\t"
3817             "punpcklqdq $dst,$dst\t! replicate16B" %}
3818   ins_encode %{
3819     __ movdl($dst$$XMMRegister, $src$$Register);
3820     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3821     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3822     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3823   %}
3824   ins_pipe( pipe_slow );
3825 %}
3826 
3827 instruct Repl16B_mem(vecX dst, memory mem) %{
3828   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3829   match(Set dst (ReplicateB (LoadB mem)));
3830   format %{ "punpcklbw $dst,$mem\n\t"
3831             "pshuflw $dst,$dst,0x00\n\t"
3832             "punpcklqdq $dst,$dst\t! replicate16B" %}
3833   ins_encode %{
3834     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3835     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3836     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3837   %}
3838   ins_pipe( pipe_slow );
3839 %}
3840 
3841 instruct Repl32B(vecY dst, rRegI src) %{
3842   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3843   match(Set dst (ReplicateB src));
3844   format %{ "movd    $dst,$src\n\t"
3845             "punpcklbw $dst,$dst\n\t"
3846             "pshuflw $dst,$dst,0x00\n\t"
3847             "punpcklqdq $dst,$dst\n\t"
3848             "vinserti128_high $dst,$dst\t! replicate32B" %}
3849   ins_encode %{
3850     __ movdl($dst$$XMMRegister, $src$$Register);
3851     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3852     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3853     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3854     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3855   %}
3856   ins_pipe( pipe_slow );
3857 %}
3858 
3859 instruct Repl32B_mem(vecY dst, memory mem) %{
3860   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3861   match(Set dst (ReplicateB (LoadB mem)));
3862   format %{ "punpcklbw $dst,$mem\n\t"
3863             "pshuflw $dst,$dst,0x00\n\t"
3864             "punpcklqdq $dst,$dst\n\t"
3865             "vinserti128_high $dst,$dst\t! replicate32B" %}
3866   ins_encode %{
3867     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3868     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3869     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3870     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3871   %}
3872   ins_pipe( pipe_slow );
3873 %}
3874 
3875 instruct Repl64B(legVecZ dst, rRegI src) %{
3876   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3877   match(Set dst (ReplicateB src));
3878   format %{ "movd    $dst,$src\n\t"
3879             "punpcklbw $dst,$dst\n\t"
3880             "pshuflw $dst,$dst,0x00\n\t"
3881             "punpcklqdq $dst,$dst\n\t"
3882             "vinserti128_high $dst,$dst\t"
3883             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3884   ins_encode %{
3885     __ movdl($dst$$XMMRegister, $src$$Register);
3886     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3887     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3888     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3889     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3890     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3891   %}
3892   ins_pipe( pipe_slow );
3893 %}
3894 
3895 instruct Repl64B_mem(legVecZ dst, memory mem) %{
3896   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3897   match(Set dst (ReplicateB (LoadB mem)));
3898   format %{ "punpcklbw $dst,$mem\n\t"
3899             "pshuflw $dst,$dst,0x00\n\t"
3900             "punpcklqdq $dst,$dst\n\t"
3901             "vinserti128_high $dst,$dst\t"
3902             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3903   ins_encode %{
3904     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3905     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3906     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3907     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3908     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3909   %}
3910   ins_pipe( pipe_slow );
3911 %}
3912 
3913 instruct Repl16B_imm(vecX dst, immI con) %{
3914   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3915   match(Set dst (ReplicateB con));
3916   format %{ "movq    $dst,[$constantaddress]\n\t"
3917             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3918   ins_encode %{
3919     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3920     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3921   %}
3922   ins_pipe( pipe_slow );
3923 %}
3924 
3925 instruct Repl32B_imm(vecY dst, immI con) %{
3926   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3927   match(Set dst (ReplicateB con));
3928   format %{ "movq    $dst,[$constantaddress]\n\t"
3929             "punpcklqdq $dst,$dst\n\t"
3930             "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
3931   ins_encode %{
3932     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3933     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3934     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3935   %}
3936   ins_pipe( pipe_slow );
3937 %}
3938 
3939 instruct Repl64B_imm(legVecZ dst, immI con) %{
3940   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3941   match(Set dst (ReplicateB con));
3942   format %{ "movq    $dst,[$constantaddress]\n\t"
3943             "punpcklqdq $dst,$dst\n\t"
3944             "vinserti128_high $dst,$dst\t"
3945             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3946   ins_encode %{
3947     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3948     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3949     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3950     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3951   %}
3952   ins_pipe( pipe_slow );
3953 %}
3954 
3955 instruct Repl4S(vecD dst, rRegI src) %{
3956   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3957   match(Set dst (ReplicateS src));
3958   format %{ "movd    $dst,$src\n\t"
3959             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3960   ins_encode %{
3961     __ movdl($dst$$XMMRegister, $src$$Register);
3962     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3963   %}
3964   ins_pipe( pipe_slow );
3965 %}
3966 
3967 instruct Repl4S_mem(vecD dst, memory mem) %{
3968   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3969   match(Set dst (ReplicateS (LoadS mem)));
3970   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3971   ins_encode %{
3972     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3973   %}
3974   ins_pipe( pipe_slow );
3975 %}
3976 
3977 instruct Repl8S(vecX dst, rRegI src) %{
3978   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3979   match(Set dst (ReplicateS src));
3980   format %{ "movd    $dst,$src\n\t"
3981             "pshuflw $dst,$dst,0x00\n\t"
3982             "punpcklqdq $dst,$dst\t! replicate8S" %}
3983   ins_encode %{
3984     __ movdl($dst$$XMMRegister, $src$$Register);
3985     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3986     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3987   %}
3988   ins_pipe( pipe_slow );
3989 %}
3990 
3991 instruct Repl8S_mem(vecX dst, memory mem) %{
3992   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3993   match(Set dst (ReplicateS (LoadS mem)));
3994   format %{ "pshuflw $dst,$mem,0x00\n\t"
3995             "punpcklqdq $dst,$dst\t! replicate8S" %}
3996   ins_encode %{
3997     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3998     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3999   %}
4000   ins_pipe( pipe_slow );
4001 %}
4002 
4003 instruct Repl8S_imm(vecX dst, immI con) %{
4004   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
4005   match(Set dst (ReplicateS con));
4006   format %{ "movq    $dst,[$constantaddress]\n\t"
4007             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
4008   ins_encode %{
4009     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4010     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4011   %}
4012   ins_pipe( pipe_slow );
4013 %}
4014 
4015 instruct Repl16S(vecY dst, rRegI src) %{
4016   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
4017   match(Set dst (ReplicateS src));
4018   format %{ "movd    $dst,$src\n\t"
4019             "pshuflw $dst,$dst,0x00\n\t"
4020             "punpcklqdq $dst,$dst\n\t"
4021             "vinserti128_high $dst,$dst\t! replicate16S" %}
4022   ins_encode %{
4023     __ movdl($dst$$XMMRegister, $src$$Register);
4024     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4025     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4026     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4027   %}
4028   ins_pipe( pipe_slow );
4029 %}
4030 
4031 instruct Repl16S_mem(vecY dst, memory mem) %{
4032   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
4033   match(Set dst (ReplicateS (LoadS mem)));
4034   format %{ "pshuflw $dst,$mem,0x00\n\t"
4035             "punpcklqdq $dst,$dst\n\t"
4036             "vinserti128_high $dst,$dst\t! replicate16S" %}
4037   ins_encode %{
4038     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
4039     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4040     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4041   %}
4042   ins_pipe( pipe_slow );
4043 %}
4044 
4045 instruct Repl16S_imm(vecY dst, immI con) %{
4046   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
4047   match(Set dst (ReplicateS con));
4048   format %{ "movq    $dst,[$constantaddress]\n\t"
4049             "punpcklqdq $dst,$dst\n\t"
4050             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
4051   ins_encode %{
4052     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4053     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4054     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4055   %}
4056   ins_pipe( pipe_slow );
4057 %}
4058 
4059 instruct Repl32S(legVecZ dst, rRegI src) %{
4060   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
4061   match(Set dst (ReplicateS src));
4062   format %{ "movd    $dst,$src\n\t"
4063             "pshuflw $dst,$dst,0x00\n\t"
4064             "punpcklqdq $dst,$dst\n\t"
4065             "vinserti128_high $dst,$dst\t"
4066             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
4067   ins_encode %{
4068     __ movdl($dst$$XMMRegister, $src$$Register);
4069     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4070     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4071     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4072     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4073   %}
4074   ins_pipe( pipe_slow );
4075 %}
4076 
4077 instruct Repl32S_mem(legVecZ dst, memory mem) %{
4078   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
4079   match(Set dst (ReplicateS (LoadS mem)));
4080   format %{ "pshuflw $dst,$mem,0x00\n\t"
4081             "punpcklqdq $dst,$dst\n\t"
4082             "vinserti128_high $dst,$dst\t"
4083             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
4084   ins_encode %{
4085     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
4086     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4087     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4088     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4089   %}
4090   ins_pipe( pipe_slow );
4091 %}
4092 
4093 instruct Repl32S_imm(legVecZ dst, immI con) %{
4094   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
4095   match(Set dst (ReplicateS con));
4096   format %{ "movq    $dst,[$constantaddress]\n\t"
4097             "punpcklqdq $dst,$dst\n\t"
4098             "vinserti128_high $dst,$dst\t"
4099             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
4100   ins_encode %{
4101     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4102     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4103     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4104     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4105   %}
4106   ins_pipe( pipe_slow );
4107 %}
4108 
4109 instruct Repl4I(vecX dst, rRegI src) %{
4110   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4111   match(Set dst (ReplicateI src));
4112   format %{ "movd    $dst,$src\n\t"
4113             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
4114   ins_encode %{
4115     __ movdl($dst$$XMMRegister, $src$$Register);
4116     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4117   %}
4118   ins_pipe( pipe_slow );
4119 %}
4120 
4121 instruct Repl4I_mem(vecX dst, memory mem) %{
4122   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4123   match(Set dst (ReplicateI (LoadI mem)));
4124   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
4125   ins_encode %{
4126     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4127   %}
4128   ins_pipe( pipe_slow );
4129 %}
4130 
4131 instruct Repl8I(vecY dst, rRegI src) %{
4132   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4133   match(Set dst (ReplicateI src));
4134   format %{ "movd    $dst,$src\n\t"
4135             "pshufd  $dst,$dst,0x00\n\t"
4136             "vinserti128_high $dst,$dst\t! replicate8I" %}
4137   ins_encode %{
4138     __ movdl($dst$$XMMRegister, $src$$Register);
4139     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4140     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4141   %}
4142   ins_pipe( pipe_slow );
4143 %}
4144 
4145 instruct Repl8I_mem(vecY dst, memory mem) %{
4146   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4147   match(Set dst (ReplicateI (LoadI mem)));
4148   format %{ "pshufd  $dst,$mem,0x00\n\t"
4149             "vinserti128_high $dst,$dst\t! replicate8I" %}
4150   ins_encode %{
4151     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4152     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4153   %}
4154   ins_pipe( pipe_slow );
4155 %}
4156 
4157 instruct Repl16I(legVecZ dst, rRegI src) %{
4158   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4159   match(Set dst (ReplicateI src));
4160   format %{ "movd    $dst,$src\n\t"
4161             "pshufd  $dst,$dst,0x00\n\t"
4162             "vinserti128_high $dst,$dst\t"
4163             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
4164   ins_encode %{
4165     __ movdl($dst$$XMMRegister, $src$$Register);
4166     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4167     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4168     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4169   %}
4170   ins_pipe( pipe_slow );
4171 %}
4172 
4173 instruct Repl16I_mem(legVecZ dst, memory mem) %{
4174   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4175   match(Set dst (ReplicateI (LoadI mem)));
4176   format %{ "pshufd  $dst,$mem,0x00\n\t"
4177             "vinserti128_high $dst,$dst\t"
4178             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
4179   ins_encode %{
4180     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4181     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4182     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4183   %}
4184   ins_pipe( pipe_slow );
4185 %}
4186 
4187 instruct Repl4I_imm(vecX dst, immI con) %{
4188   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4189   match(Set dst (ReplicateI con));
4190   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4191             "punpcklqdq $dst,$dst" %}
4192   ins_encode %{
4193     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4194     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4195   %}
4196   ins_pipe( pipe_slow );
4197 %}
4198 
4199 instruct Repl8I_imm(vecY dst, immI con) %{
4200   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4201   match(Set dst (ReplicateI con));
4202   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4203             "punpcklqdq $dst,$dst\n\t"
4204             "vinserti128_high $dst,$dst" %}
4205   ins_encode %{
4206     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4207     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4208     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4209   %}
4210   ins_pipe( pipe_slow );
4211 %}
4212 
4213 instruct Repl16I_imm(legVecZ dst, immI con) %{
4214   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4215   match(Set dst (ReplicateI con));
4216   format %{ "movq    $dst,[$constantaddress]\t"
4217             "punpcklqdq $dst,$dst\n\t"
4218             "vinserti128_high $dst,$dst"
4219             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
4220   ins_encode %{
4221     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4222     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4223     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4224     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4225   %}
4226   ins_pipe( pipe_slow );
4227 %}
4228 
// A long can be loaded into an XMM register directly from memory.
4230 instruct Repl2L_mem(vecX dst, memory mem) %{
4231   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
4232   match(Set dst (ReplicateL (LoadL mem)));
4233   format %{ "movq    $dst,$mem\n\t"
4234             "punpcklqdq $dst,$dst\t! replicate2L" %}
4235   ins_encode %{
4236     __ movq($dst$$XMMRegister, $mem$$Address);
4237     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4238   %}
4239   ins_pipe( pipe_slow );
4240 %}
4241 
4242 // Replicate long (8 byte) scalar to be vector
4243 #ifdef _LP64
4244 instruct Repl4L(vecY dst, rRegL src) %{
4245   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4246   match(Set dst (ReplicateL src));
4247   format %{ "movdq   $dst,$src\n\t"
4248             "punpcklqdq $dst,$dst\n\t"
4249             "vinserti128_high $dst,$dst\t! replicate4L" %}
4250   ins_encode %{
4251     __ movdq($dst$$XMMRegister, $src$$Register);
4252     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4253     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4254   %}
4255   ins_pipe( pipe_slow );
4256 %}
4257 
4258 instruct Repl8L(legVecZ dst, rRegL src) %{
4259   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4260   match(Set dst (ReplicateL src));
4261   format %{ "movdq   $dst,$src\n\t"
4262             "punpcklqdq $dst,$dst\n\t"
4263             "vinserti128_high $dst,$dst\t"
4264             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
4265   ins_encode %{
4266     __ movdq($dst$$XMMRegister, $src$$Register);
4267     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4268     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4269     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4270   %}
4271   ins_pipe( pipe_slow );
4272 %}
4273 #else // _LP64
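// Without 64-bit scalar registers, the long arrives as a register pair:
// broadcast the low and high halves separately and interleave them.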
4274 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
4275   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4276   match(Set dst (ReplicateL src));
4277   effect(TEMP dst, USE src, TEMP tmp);
4278   format %{ "movdl   $dst,$src.lo\n\t"
4279             "movdl   $tmp,$src.hi\n\t"
4280             "punpckldq $dst,$tmp\n\t"
4281             "punpcklqdq $dst,$dst\n\t"
4282             "vinserti128_high $dst,$dst\t! replicate4L" %}
4283   ins_encode %{
4284     __ movdl($dst$$XMMRegister, $src$$Register);
4285     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4286     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4287     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4288     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4289   %}
4290   ins_pipe( pipe_slow );
4291 %}
4292 
4293 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
4294   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4295   match(Set dst (ReplicateL src));
4296   effect(TEMP dst, USE src, TEMP tmp);
4297   format %{ "movdl   $dst,$src.lo\n\t"
4298             "movdl   $tmp,$src.hi\n\t"
4299             "punpckldq $dst,$tmp\n\t"
4300             "punpcklqdq $dst,$dst\n\t"
4301             "vinserti128_high $dst,$dst\t"
4302             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
4303   ins_encode %{
4304     __ movdl($dst$$XMMRegister, $src$$Register);
4305     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4306     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4307     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4308     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4309     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4310   %}
4311   ins_pipe( pipe_slow );
4312 %}
4313 #endif // _LP64
4314 
4315 instruct Repl4L_imm(vecY dst, immL con) %{
4316   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4317   match(Set dst (ReplicateL con));
4318   format %{ "movq    $dst,[$constantaddress]\n\t"
4319             "punpcklqdq $dst,$dst\n\t"
4320             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
4321   ins_encode %{
4322     __ movq($dst$$XMMRegister, $constantaddress($con));
4323     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4324     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4325   %}
4326   ins_pipe( pipe_slow );
4327 %}
4328 
4329 instruct Repl8L_imm(legVecZ dst, immL con) %{
4330   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4331   match(Set dst (ReplicateL con));
4332   format %{ "movq    $dst,[$constantaddress]\n\t"
4333             "punpcklqdq $dst,$dst\n\t"
4334             "vinserti128_high $dst,$dst\t"
4335             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
4336   ins_encode %{
4337     __ movq($dst$$XMMRegister, $constantaddress($con));
4338     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4339     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4340     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4341   %}
4342   ins_pipe( pipe_slow );
4343 %}
4344 
4345 instruct Repl4L_mem(vecY dst, memory mem) %{
4346   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4347   match(Set dst (ReplicateL (LoadL mem)));
4348   format %{ "movq    $dst,$mem\n\t"
4349             "punpcklqdq $dst,$dst\n\t"
4350             "vinserti128_high $dst,$dst\t! replicate4L" %}
4351   ins_encode %{
4352     __ movq($dst$$XMMRegister, $mem$$Address);
4353     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4354     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4355   %}
4356   ins_pipe( pipe_slow );
4357 %}
4358 
4359 instruct Repl8L_mem(legVecZ dst, memory mem) %{
4360   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4361   match(Set dst (ReplicateL (LoadL mem)));
4362   format %{ "movq    $dst,$mem\n\t"
4363             "punpcklqdq $dst,$dst\n\t"
4364             "vinserti128_high $dst,$dst\t"
4365             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
4366   ins_encode %{
4367     __ movq($dst$$XMMRegister, $mem$$Address);
4368     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4369     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4370     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4371   %}
4372   ins_pipe( pipe_slow );
4373 %}
4374 
4375 instruct Repl2F_mem(vecD dst, memory mem) %{
4376   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4377   match(Set dst (ReplicateF (LoadF mem)));
4378   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
4379   ins_encode %{
4380     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4381   %}
4382   ins_pipe( pipe_slow );
4383 %}
4384 
4385 instruct Repl4F_mem(vecX dst, memory mem) %{
4386   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4387   match(Set dst (ReplicateF (LoadF mem)));
4388   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
4389   ins_encode %{
4390     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4391   %}
4392   ins_pipe( pipe_slow );
4393 %}
4394 
4395 instruct Repl8F(vecY dst, vlRegF src) %{
4396   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4397   match(Set dst (ReplicateF src));
4398   format %{ "pshufd  $dst,$src,0x00\n\t"
4399             "vinsertf128_high $dst,$dst\t! replicate8F" %}
4400   ins_encode %{
4401     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4402     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4403   %}
4404   ins_pipe( pipe_slow );
4405 %}
4406 
4407 instruct Repl8F_mem(vecY dst, memory mem) %{
4408   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4409   match(Set dst (ReplicateF (LoadF mem)));
4410   format %{ "pshufd  $dst,$mem,0x00\n\t"
4411             "vinsertf128_high $dst,$dst\t! replicate8F" %}
4412   ins_encode %{
4413     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4414     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4415   %}
4416   ins_pipe( pipe_slow );
4417 %}
4418 
4419 instruct Repl16F(legVecZ dst, vlRegF src) %{
4420   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4421   match(Set dst (ReplicateF src));
4422   format %{ "pshufd  $dst,$src,0x00\n\t"
4423             "vinsertf128_high $dst,$dst\t"
4424             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
4425   ins_encode %{
4426     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4427     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4428     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4429   %}
4430   ins_pipe( pipe_slow );
4431 %}
4432 
4433 instruct Repl16F_mem(legVecZ dst, memory mem) %{
4434   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
4435   match(Set dst (ReplicateF (LoadF mem)));
4436   format %{ "pshufd  $dst,$mem,0x00\n\t"
4437             "vinsertf128_high $dst,$dst\t"
4438             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
4439   ins_encode %{
4440     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
4441     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4442     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4443   %}
4444   ins_pipe( pipe_slow );
4445 %}
4446 
4447 instruct Repl2F_zero(vecD dst, immF0 zero) %{
4448   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
4449   match(Set dst (ReplicateF zero));
4450   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
4451   ins_encode %{
4452     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4453   %}
4454   ins_pipe( fpu_reg_reg );
4455 %}
4456 
4457 instruct Repl4F_zero(vecX dst, immF0 zero) %{
4458   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
4459   match(Set dst (ReplicateF zero));
4460   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
4461   ins_encode %{
4462     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4463   %}
4464   ins_pipe( fpu_reg_reg );
4465 %}
4466 
4467 instruct Repl8F_zero(vecY dst, immF0 zero) %{
4468   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
4469   match(Set dst (ReplicateF zero));
4470   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
4471   ins_encode %{
4472     int vector_len = 1;
4473     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4474   %}
4475   ins_pipe( fpu_reg_reg );
4476 %}
4477 
4478 instruct Repl2D_mem(vecX dst, memory mem) %{
4479   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4480   match(Set dst (ReplicateD (LoadD mem)));
4481   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
4482   ins_encode %{
4483     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4484   %}
4485   ins_pipe( pipe_slow );
4486 %}
4487 
4488 instruct Repl4D(vecY dst, vlRegD src) %{
4489   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4490   match(Set dst (ReplicateD src));
4491   format %{ "pshufd  $dst,$src,0x44\n\t"
4492             "vinsertf128_high $dst,$dst\t! replicate4D" %}
4493   ins_encode %{
4494     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4495     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4496   %}
4497   ins_pipe( pipe_slow );
4498 %}
4499 
4500 instruct Repl4D_mem(vecY dst, memory mem) %{
4501   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
4502   match(Set dst (ReplicateD (LoadD mem)));
4503   format %{ "pshufd  $dst,$mem,0x44\n\t"
4504             "vinsertf128_high $dst,$dst\t! replicate4D" %}
4505   ins_encode %{
4506     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4507     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4508   %}
4509   ins_pipe( pipe_slow );
4510 %}
4511 
4512 instruct Repl8D(legVecZ dst, vlRegD src) %{
4513   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
4514   match(Set dst (ReplicateD src));
4515   format %{ "pshufd  $dst,$src,0x44\n\t"
4516             "vinsertf128_high $dst,$dst\t"
4517             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
4518   ins_encode %{
4519     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4520     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4521     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4522   %}
4523   ins_pipe( pipe_slow );
4524 %}
4525 
4526 instruct Repl8D_mem(legVecZ dst, memory mem) %{
4527   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
4528   match(Set dst (ReplicateD (LoadD mem)));
4529   format %{ "pshufd  $dst,$mem,0x44\n\t"
4530             "vinsertf128_high $dst,$dst\t"
4531             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
4532   ins_encode %{
4533     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
4534     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4535     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4536   %}
4537   ins_pipe( pipe_slow );
4538 %}
4539 
4540 // Replicate double (8 byte) scalar zero to be vector
4541 instruct Repl2D_zero(vecX dst, immD0 zero) %{
4542   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
4543   match(Set dst (ReplicateD zero));
4544   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
4545   ins_encode %{
4546     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4547   %}
4548   ins_pipe( fpu_reg_reg );
4549 %}
4550 
4551 instruct Repl4D_zero(vecY dst, immD0 zero) %{
4552   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
4553   match(Set dst (ReplicateD zero));
4554   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
4555   ins_encode %{
4556     int vector_len = 1;
4557     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4558   %}
4559   ins_pipe( fpu_reg_reg );
4560 %}
4561 
4562 // ====================GENERIC REPLICATE==========================================
4563 
4564 // Replicate byte scalar to be vector
4565 instruct Repl4B(vecS dst, rRegI src) %{
4566   predicate(n->as_Vector()->length() == 4);
4567   match(Set dst (ReplicateB src));
4568   format %{ "movd    $dst,$src\n\t"
4569             "punpcklbw $dst,$dst\n\t"
4570             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
4571   ins_encode %{
4572     __ movdl($dst$$XMMRegister, $src$$Register);
4573     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4574     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4575   %}
4576   ins_pipe( pipe_slow );
4577 %}
4578 
4579 instruct Repl8B(vecD dst, rRegI src) %{
4580   predicate(n->as_Vector()->length() == 8);
4581   match(Set dst (ReplicateB src));
4582   format %{ "movd    $dst,$src\n\t"
4583             "punpcklbw $dst,$dst\n\t"
4584             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
4585   ins_encode %{
4586     __ movdl($dst$$XMMRegister, $src$$Register);
4587     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4588     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4589   %}
4590   ins_pipe( pipe_slow );
4591 %}
4592 
4593 // Replicate byte scalar immediate to be vector by loading from const table.
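// (The replicate4_imm/replicate8_imm helpers repeat the immediate across a
// 4- or 8-byte constant at the given element width, so one movdl/movq load
// produces the replicated pattern.)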
4594 instruct Repl4B_imm(vecS dst, immI con) %{
4595   predicate(n->as_Vector()->length() == 4);
4596   match(Set dst (ReplicateB con));
4597   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
4598   ins_encode %{
4599     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
4600   %}
4601   ins_pipe( pipe_slow );
4602 %}
4603 
4604 instruct Repl8B_imm(vecD dst, immI con) %{
4605   predicate(n->as_Vector()->length() == 8);
4606   match(Set dst (ReplicateB con));
4607   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
4608   ins_encode %{
4609     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4610   %}
4611   ins_pipe( pipe_slow );
4612 %}
4613 
4614 // Replicate byte scalar zero to be vector
4615 instruct Repl4B_zero(vecS dst, immI0 zero) %{
4616   predicate(n->as_Vector()->length() == 4);
4617   match(Set dst (ReplicateB zero));
4618   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
4619   ins_encode %{
4620     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4621   %}
4622   ins_pipe( fpu_reg_reg );
4623 %}
4624 
4625 instruct Repl8B_zero(vecD dst, immI0 zero) %{
4626   predicate(n->as_Vector()->length() == 8);
4627   match(Set dst (ReplicateB zero));
4628   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
4629   ins_encode %{
4630     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4631   %}
4632   ins_pipe( fpu_reg_reg );
4633 %}
4634 
4635 instruct Repl16B_zero(vecX dst, immI0 zero) %{
4636   predicate(n->as_Vector()->length() == 16);
4637   match(Set dst (ReplicateB zero));
4638   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
4639   ins_encode %{
4640     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4641   %}
4642   ins_pipe( fpu_reg_reg );
4643 %}
4644 
4645 instruct Repl32B_zero(vecY dst, immI0 zero) %{
4646   predicate(n->as_Vector()->length() == 32);
4647   match(Set dst (ReplicateB zero));
4648   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
4649   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when the 256-bit integer vpxor is unavailable (it requires AVX2).
4651     int vector_len = 1;
4652     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4653   %}
4654   ins_pipe( fpu_reg_reg );
4655 %}
4656 
4657 // Replicate char/short (2 byte) scalar to be vector
4658 instruct Repl2S(vecS dst, rRegI src) %{
4659   predicate(n->as_Vector()->length() == 2);
4660   match(Set dst (ReplicateS src));
4661   format %{ "movd    $dst,$src\n\t"
4662             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
4663   ins_encode %{
4664     __ movdl($dst$$XMMRegister, $src$$Register);
4665     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4666   %}
4667   ins_pipe( fpu_reg_reg );
4668 %}
4669 
4670 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
4671 instruct Repl2S_imm(vecS dst, immI con) %{
4672   predicate(n->as_Vector()->length() == 2);
4673   match(Set dst (ReplicateS con));
4674   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
4675   ins_encode %{
4676     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
4677   %}
4678   ins_pipe( fpu_reg_reg );
4679 %}
4680 
4681 instruct Repl4S_imm(vecD dst, immI con) %{
4682   predicate(n->as_Vector()->length() == 4);
4683   match(Set dst (ReplicateS con));
4684   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
4685   ins_encode %{
4686     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4687   %}
4688   ins_pipe( fpu_reg_reg );
4689 %}
4690 
4691 // Replicate char/short (2 byte) scalar zero to be vector
4692 instruct Repl2S_zero(vecS dst, immI0 zero) %{
4693   predicate(n->as_Vector()->length() == 2);
4694   match(Set dst (ReplicateS zero));
4695   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
4696   ins_encode %{
4697     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4698   %}
4699   ins_pipe( fpu_reg_reg );
4700 %}
4701 
4702 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4703   predicate(n->as_Vector()->length() == 4);
4704   match(Set dst (ReplicateS zero));
4705   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4706   ins_encode %{
4707     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4708   %}
4709   ins_pipe( fpu_reg_reg );
4710 %}
4711 
4712 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4713   predicate(n->as_Vector()->length() == 8);
4714   match(Set dst (ReplicateS zero));
4715   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4716   ins_encode %{
4717     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4718   %}
4719   ins_pipe( fpu_reg_reg );
4720 %}
4721 
4722 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4723   predicate(n->as_Vector()->length() == 16);
4724   match(Set dst (ReplicateS zero));
4725   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4726   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when the 256-bit integer vpxor is unavailable (it requires AVX2).
4728     int vector_len = 1;
4729     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4730   %}
4731   ins_pipe( fpu_reg_reg );
4732 %}
4733 
4734 // Replicate integer (4 byte) scalar to be vector
4735 instruct Repl2I(vecD dst, rRegI src) %{
4736   predicate(n->as_Vector()->length() == 2);
4737   match(Set dst (ReplicateI src));
4738   format %{ "movd    $dst,$src\n\t"
4739             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4740   ins_encode %{
4741     __ movdl($dst$$XMMRegister, $src$$Register);
4742     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4743   %}
4744   ins_pipe( fpu_reg_reg );
4745 %}
4746 
// An integer can be loaded into an XMM register directly from memory.
4748 instruct Repl2I_mem(vecD dst, memory mem) %{
4749   predicate(n->as_Vector()->length() == 2);
4750   match(Set dst (ReplicateI (LoadI mem)));
4751   format %{ "movd    $dst,$mem\n\t"
4752             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4753   ins_encode %{
4754     __ movdl($dst$$XMMRegister, $mem$$Address);
4755     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4756   %}
4757   ins_pipe( fpu_reg_reg );
4758 %}
4759 
4760 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4761 instruct Repl2I_imm(vecD dst, immI con) %{
4762   predicate(n->as_Vector()->length() == 2);
4763   match(Set dst (ReplicateI con));
4764   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4765   ins_encode %{
4766     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4767   %}
4768   ins_pipe( fpu_reg_reg );
4769 %}
4770 
4771 // Replicate integer (4 byte) scalar zero to be vector
4772 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4773   predicate(n->as_Vector()->length() == 2);
4774   match(Set dst (ReplicateI zero));
4775   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4776   ins_encode %{
4777     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4778   %}
4779   ins_pipe( fpu_reg_reg );
4780 %}
4781 
4782 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4783   predicate(n->as_Vector()->length() == 4);
4784   match(Set dst (ReplicateI zero));
4785   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4786   ins_encode %{
4787     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4788   %}
4789   ins_pipe( fpu_reg_reg );
4790 %}
4791 
4792 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4793   predicate(n->as_Vector()->length() == 8);
4794   match(Set dst (ReplicateI zero));
4795   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4796   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when the 256-bit integer vpxor is unavailable (it requires AVX2).
4798     int vector_len = 1;
4799     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4800   %}
4801   ins_pipe( fpu_reg_reg );
4802 %}
4803 
4804 // Replicate long (8 byte) scalar to be vector
4805 #ifdef _LP64
4806 instruct Repl2L(vecX dst, rRegL src) %{
4807   predicate(n->as_Vector()->length() == 2);
4808   match(Set dst (ReplicateL src));
4809   format %{ "movdq   $dst,$src\n\t"
4810             "punpcklqdq $dst,$dst\t! replicate2L" %}
4811   ins_encode %{
4812     __ movdq($dst$$XMMRegister, $src$$Register);
4813     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4814   %}
4815   ins_pipe( pipe_slow );
4816 %}
4817 #else // _LP64
4818 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4819   predicate(n->as_Vector()->length() == 2);
4820   match(Set dst (ReplicateL src));
4821   effect(TEMP dst, USE src, TEMP tmp);
4822   format %{ "movdl   $dst,$src.lo\n\t"
4823             "movdl   $tmp,$src.hi\n\t"
4824             "punpckldq $dst,$tmp\n\t"
4825             "punpcklqdq $dst,$dst\t! replicate2L"%}
4826   ins_encode %{
4827     __ movdl($dst$$XMMRegister, $src$$Register);
4828     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4829     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4830     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4831   %}
4832   ins_pipe( pipe_slow );
4833 %}
4834 #endif // _LP64
4835 
4836 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4837 instruct Repl2L_imm(vecX dst, immL con) %{
4838   predicate(n->as_Vector()->length() == 2);
4839   match(Set dst (ReplicateL con));
4840   format %{ "movq    $dst,[$constantaddress]\n\t"
4841             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4842   ins_encode %{
4843     __ movq($dst$$XMMRegister, $constantaddress($con));
4844     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4845   %}
4846   ins_pipe( pipe_slow );
4847 %}
4848 
4849 // Replicate long (8 byte) scalar zero to be vector
4850 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4851   predicate(n->as_Vector()->length() == 2);
4852   match(Set dst (ReplicateL zero));
4853   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4854   ins_encode %{
4855     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4856   %}
4857   ins_pipe( fpu_reg_reg );
4858 %}
4859 
4860 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4861   predicate(n->as_Vector()->length() == 4);
4862   match(Set dst (ReplicateL zero));
4863   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4864   ins_encode %{
    // MacroAssembler::vpxor falls back to vxorpd when the 256-bit integer vpxor is unavailable (it requires AVX2).
4866     int vector_len = 1;
4867     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4868   %}
4869   ins_pipe( fpu_reg_reg );
4870 %}
4871 
4872 // Replicate float (4 byte) scalar to be vector
4873 instruct Repl2F(vecD dst, vlRegF src) %{
4874   predicate(n->as_Vector()->length() == 2);
4875   match(Set dst (ReplicateF src));
4876   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4877   ins_encode %{
4878     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4879   %}
4880   ins_pipe( fpu_reg_reg );
4881 %}
4882 
4883 instruct Repl4F(vecX dst, vlRegF src) %{
4884   predicate(n->as_Vector()->length() == 4);
4885   match(Set dst (ReplicateF src));
4886   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4887   ins_encode %{
4888     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4889   %}
4890   ins_pipe( pipe_slow );
4891 %}
4892 
4893 // Replicate double (8 bytes) scalar to be vector
4894 instruct Repl2D(vecX dst, vlRegD src) %{
4895   predicate(n->as_Vector()->length() == 2);
4896   match(Set dst (ReplicateD src));
4897   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4898   ins_encode %{
4899     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4900   %}
4901   ins_pipe( pipe_slow );
4902 %}
4903 
4904 // ====================EVEX REPLICATE=============================================
4905 
4906 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4907   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4908   match(Set dst (ReplicateB (LoadB mem)));
4909   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4910   ins_encode %{
4911     int vector_len = 0;
4912     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4913   %}
4914   ins_pipe( pipe_slow );
4915 %}
4916 
4917 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4918   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4919   match(Set dst (ReplicateB (LoadB mem)));
4920   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4921   ins_encode %{
4922     int vector_len = 0;
4923     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4924   %}
4925   ins_pipe( pipe_slow );
4926 %}
4927 
4928 instruct Repl16B_evex(vecX dst, rRegI src) %{
4929   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4930   match(Set dst (ReplicateB src));
4931   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4932   ins_encode %{
    int vector_len = 0;
4934     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4935   %}
4936   ins_pipe( pipe_slow );
4937 %}
4938 
4939 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4940   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4941   match(Set dst (ReplicateB (LoadB mem)));
4942   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4943   ins_encode %{
4944     int vector_len = 0;
4945     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4946   %}
4947   ins_pipe( pipe_slow );
4948 %}
4949 
4950 instruct Repl32B_evex(vecY dst, rRegI src) %{
4951   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4952   match(Set dst (ReplicateB src));
4953   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4954   ins_encode %{
    int vector_len = 1;
4956     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4957   %}
4958   ins_pipe( pipe_slow );
4959 %}
4960 
4961 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4962   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4963   match(Set dst (ReplicateB (LoadB mem)));
4964   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4965   ins_encode %{
4966     int vector_len = 1;
4967     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4968   %}
4969   ins_pipe( pipe_slow );
4970 %}
4971 
4972 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4973   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4974   match(Set dst (ReplicateB src));
4975   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4976   ins_encode %{
    int vector_len = 2;
4978     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4979   %}
4980   ins_pipe( pipe_slow );
4981 %}
4982 
4983 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4984   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4985   match(Set dst (ReplicateB (LoadB mem)));
4986   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4987   ins_encode %{
4988     int vector_len = 2;
4989     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4990   %}
4991   ins_pipe( pipe_slow );
4992 %}
4993 
4994 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4995   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4996   match(Set dst (ReplicateB con));
4997   format %{ "movq    $dst,[$constantaddress]\n\t"
4998             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4999   ins_encode %{
    int vector_len = 0;
5001     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
5002     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5003   %}
5004   ins_pipe( pipe_slow );
5005 %}
5006 
5007 instruct Repl32B_imm_evex(vecY dst, immI con) %{
5008   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5009   match(Set dst (ReplicateB con));
5010   format %{ "movq    $dst,[$constantaddress]\n\t"
5011             "vpbroadcastb $dst,$dst\t! replicate32B" %}
5012   ins_encode %{
    int vector_len = 1;
5014     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
5015     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5016   %}
5017   ins_pipe( pipe_slow );
5018 %}
5019 
5020 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
5021   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
5022   match(Set dst (ReplicateB con));
5023   format %{ "movq    $dst,[$constantaddress]\n\t"
5024             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
5025   ins_encode %{
    int vector_len = 2;
5027     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
5028     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5029   %}
5030   ins_pipe( pipe_slow );
5031 %}
5032 
5033 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
5034   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
5035   match(Set dst (ReplicateB zero));
5036   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
5037   ins_encode %{
    // The 512-bit vpxor is EVEX-encoded; the UseAVX > 2 predicate guarantees it is available.
    int vector_len = 2;
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5041   %}
5042   ins_pipe( fpu_reg_reg );
5043 %}
5044 
5045 instruct Repl4S_evex(vecD dst, rRegI src) %{
5046   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5047   match(Set dst (ReplicateS src));
5048   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
5049   ins_encode %{
    int vector_len = 0;
5051     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5052   %}
5053   ins_pipe( pipe_slow );
5054 %}
5055 
5056 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
5057   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5058   match(Set dst (ReplicateS (LoadS mem)));
5059   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
5060   ins_encode %{
5061     int vector_len = 0;
5062     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5063   %}
5064   ins_pipe( pipe_slow );
5065 %}
5066 
5067 instruct Repl8S_evex(vecX dst, rRegI src) %{
5068   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5069   match(Set dst (ReplicateS src));
5070   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
5071   ins_encode %{
    int vector_len = 0;
5073     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5074   %}
5075   ins_pipe( pipe_slow );
5076 %}
5077 
5078 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
5079   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5080   match(Set dst (ReplicateS (LoadS mem)));
5081   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
5082   ins_encode %{
5083     int vector_len = 0;
5084     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5085   %}
5086   ins_pipe( pipe_slow );
5087 %}
5088 
5089 instruct Repl16S_evex(vecY dst, rRegI src) %{
5090   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5091   match(Set dst (ReplicateS src));
5092   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
5093   ins_encode %{
    int vector_len = 1;
5095     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5096   %}
5097   ins_pipe( pipe_slow );
5098 %}
5099 
5100 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
5101   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5102   match(Set dst (ReplicateS (LoadS mem)));
5103   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
5104   ins_encode %{
5105     int vector_len = 1;
5106     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5107   %}
5108   ins_pipe( pipe_slow );
5109 %}
5110 
5111 instruct Repl32S_evex(vecZ dst, rRegI src) %{
5112   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
5113   match(Set dst (ReplicateS src));
5114   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
5115   ins_encode %{
    int vector_len = 2;
5117     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
5118   %}
5119   ins_pipe( pipe_slow );
5120 %}
5121 
5122 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
5123   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
5124   match(Set dst (ReplicateS (LoadS mem)));
5125   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
5126   ins_encode %{
5127     int vector_len = 2;
5128     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
5129   %}
5130   ins_pipe( pipe_slow );
5131 %}
5132 
5133 instruct Repl8S_imm_evex(vecX dst, immI con) %{
5134   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5135   match(Set dst (ReplicateS con));
5136   format %{ "movq    $dst,[$constantaddress]\n\t"
5137             "vpbroadcastw $dst,$dst\t! replicate8S" %}
5138   ins_encode %{
    int vector_len = 0;
5140     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
5141     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5142   %}
5143   ins_pipe( pipe_slow );
5144 %}
5145 
5146 instruct Repl16S_imm_evex(vecY dst, immI con) %{
5147   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
5148   match(Set dst (ReplicateS con));
5149   format %{ "movq    $dst,[$constantaddress]\n\t"
5150             "vpbroadcastw $dst,$dst\t! replicate16S" %}
5151   ins_encode %{
    int vector_len = 1;
5153     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
5154     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5155   %}
5156   ins_pipe( pipe_slow );
5157 %}
5158 
5159 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
5160   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
5161   match(Set dst (ReplicateS con));
5162   format %{ "movq    $dst,[$constantaddress]\n\t"
5163             "vpbroadcastw $dst,$dst\t! replicate32S" %}
5164   ins_encode %{
    int vector_len = 2;
5166     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
5167     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5168   %}
5169   ins_pipe( pipe_slow );
5170 %}
5171 
5172 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
5173   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
5174   match(Set dst (ReplicateS zero));
5175   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
5176   ins_encode %{
    // 512-bit vpxor is only available with the EVEX encoding (plain AVX has no 512-bit form); UseAVX > 2 guarantees it here.
5178     int vector_len = 2;
5179     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5180   %}
5181   ins_pipe( fpu_reg_reg );
5182 %}
5183 
5184 instruct Repl4I_evex(vecX dst, rRegI src) %{
5185   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5186   match(Set dst (ReplicateI src));
5187   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
5188   ins_encode %{
5189     int vector_len = 0;
5190     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
5196   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5197   match(Set dst (ReplicateI (LoadI mem)));
5198   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
5199   ins_encode %{
5200     int vector_len = 0;
5201     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
5202   %}
5203   ins_pipe( pipe_slow );
5204 %}
5205 
5206 instruct Repl8I_evex(vecY dst, rRegI src) %{
5207   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5208   match(Set dst (ReplicateI src));
5209   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
5210   ins_encode %{
5211     int vector_len = 1;
5212     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
5213   %}
5214   ins_pipe( pipe_slow );
5215 %}
5216 
5217 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
5218   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5219   match(Set dst (ReplicateI (LoadI mem)));
5220   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
5221   ins_encode %{
5222     int vector_len = 1;
5223     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
5224   %}
5225   ins_pipe( pipe_slow );
5226 %}
5227 
5228 instruct Repl16I_evex(vecZ dst, rRegI src) %{
5229   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5230   match(Set dst (ReplicateI src));
5231   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
5232   ins_encode %{
5233     int vector_len = 2;
5234     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
5235   %}
5236   ins_pipe( pipe_slow );
5237 %}
5238 
5239 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
5240   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5241   match(Set dst (ReplicateI (LoadI mem)));
5242   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
5243   ins_encode %{
5244     int vector_len = 2;
5245     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
5246   %}
5247   ins_pipe( pipe_slow );
5248 %}
5249 
5250 instruct Repl4I_imm_evex(vecX dst, immI con) %{
5251   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5252   match(Set dst (ReplicateI con));
  format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
5254             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
5255   ins_encode %{
5256     int vector_len = 0;
5257     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
5258     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5259   %}
5260   ins_pipe( pipe_slow );
5261 %}
5262 
5263 instruct Repl8I_imm_evex(vecY dst, immI con) %{
5264   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5265   match(Set dst (ReplicateI con));
5266   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
5267             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
5268   ins_encode %{
5269     int vector_len = 1;
5270     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
5271     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5272   %}
5273   ins_pipe( pipe_slow );
5274 %}
5275 
5276 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
5277   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5278   match(Set dst (ReplicateI con));
5279   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
5280             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
5281   ins_encode %{
5282     int vector_len = 2;
5283     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
5284     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5285   %}
5286   ins_pipe( pipe_slow );
5287 %}
5288 
5289 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
5290   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5291   match(Set dst (ReplicateI zero));
5292   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
5293   ins_encode %{
    // 512-bit vpxor is only available with the EVEX encoding (plain AVX has no 512-bit form); UseAVX > 2 guarantees it here.
5295     int vector_len = 2;
5296     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5297   %}
5298   ins_pipe( fpu_reg_reg );
5299 %}
5300 
5301 // Replicate long (8 byte) scalar to be vector
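// On 64-bit, a ReplicateL collapses to a single broadcast from a GPR; as a
// sketch, with the scalar in rax and the destination in zmm3 (illustrative
// register choices):
//
//   evpbroadcastq zmm3, rax    // all eight 64-bit lanes <- rax
//
// On 32-bit no GPR is wide enough, so the #else branch below first pairs the
// two 32-bit halves of the long inside an XMM register.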
5302 #ifdef _LP64
5303 instruct Repl4L_evex(vecY dst, rRegL src) %{
5304   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5305   match(Set dst (ReplicateL src));
5306   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
5307   ins_encode %{
5308     int vector_len = 1;
5309     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
5310   %}
5311   ins_pipe( pipe_slow );
5312 %}
5313 
5314 instruct Repl8L_evex(vecZ dst, rRegL src) %{
5315   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5316   match(Set dst (ReplicateL src));
5317   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
5318   ins_encode %{
5319     int vector_len = 2;
5320     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
5321   %}
5322   ins_pipe( pipe_slow );
5323 %}
5324 #else // _LP64
5325 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
5326   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5327   match(Set dst (ReplicateL src));
5328   effect(TEMP dst, USE src, TEMP tmp);
5329   format %{ "movdl   $dst,$src.lo\n\t"
5330             "movdl   $tmp,$src.hi\n\t"
5331             "punpckldq $dst,$tmp\n\t"
5332             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
5333   ins_encode %{
5334     int vector_len = 1;
5335     __ movdl($dst$$XMMRegister, $src$$Register);
5336     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
5337     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
5338     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5339   %}
5340   ins_pipe( pipe_slow );
5341 %}
5342 
5343 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
5344   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5345   match(Set dst (ReplicateL src));
5346   effect(TEMP dst, USE src, TEMP tmp);
5347   format %{ "movdl   $dst,$src.lo\n\t"
5348             "movdl   $tmp,$src.hi\n\t"
5349             "punpckldq $dst,$tmp\n\t"
5350             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
5351   ins_encode %{
5352     int vector_len = 2;
5353     __ movdl($dst$$XMMRegister, $src$$Register);
5354     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
5355     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
5356     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5357   %}
5358   ins_pipe( pipe_slow );
5359 %}
5360 #endif // _LP64
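
// A concrete trace of the 32-bit pairing above, for the illustrative value
// src = 0x1122334455667788:
//
//   movdl  dst, src.lo     // dst = 0x55667788
//   movdl  tmp, src.hi     // tmp = 0x11223344
//   punpckldq dst, tmp     // low qword of dst = 0x1122334455667788
//   vpbroadcastq dst, dst  // replicate that qword across the vector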
5361 
5362 instruct Repl4L_imm_evex(vecY dst, immL con) %{
5363   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5364   match(Set dst (ReplicateL con));
5365   format %{ "movq    $dst,[$constantaddress]\n\t"
5366             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
5367   ins_encode %{
5368     int vector_len = 1;
5369     __ movq($dst$$XMMRegister, $constantaddress($con));
5370     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5371   %}
5372   ins_pipe( pipe_slow );
5373 %}
5374 
5375 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
5376   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5377   match(Set dst (ReplicateL con));
5378   format %{ "movq    $dst,[$constantaddress]\n\t"
5379             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
5380   ins_encode %{
5381     int vector_len = 2;
5382     __ movq($dst$$XMMRegister, $constantaddress($con));
5383     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5384   %}
5385   ins_pipe( pipe_slow );
5386 %}
5387 
5388 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
5389   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
5390   match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate2L" %}
5392   ins_encode %{
5393     int vector_len = 0;
5394     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
5395   %}
5396   ins_pipe( pipe_slow );
5397 %}
5398 
5399 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
5400   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5401   match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
5403   ins_encode %{
5404     int vector_len = 1;
5405     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
5406   %}
5407   ins_pipe( pipe_slow );
5408 %}
5409 
5410 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
5411   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5412   match(Set dst (ReplicateL (LoadL mem)));
  format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
5414   ins_encode %{
5415     int vector_len = 2;
5416     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
5417   %}
5418   ins_pipe( pipe_slow );
5419 %}
5420 
5421 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
5422   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5423   match(Set dst (ReplicateL zero));
5424   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
5425   ins_encode %{
    // 512-bit vpxor is only available with the EVEX encoding (plain AVX has no 512-bit form); UseAVX > 2 guarantees it here.
5427     int vector_len = 2;
5428     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5429   %}
5430   ins_pipe( fpu_reg_reg );
5431 %}
5432 
5433 instruct Repl8F_evex(vecY dst, regF src) %{
5434   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5435   match(Set dst (ReplicateF src));
5436   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
5437   ins_encode %{
5438     int vector_len = 1;
5439     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5440   %}
5441   ins_pipe( pipe_slow );
5442 %}
5443 
5444 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
5445   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
5446   match(Set dst (ReplicateF (LoadF mem)));
5447   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
5448   ins_encode %{
5449     int vector_len = 1;
5450     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
5451   %}
5452   ins_pipe( pipe_slow );
5453 %}
5454 
5455 instruct Repl16F_evex(vecZ dst, regF src) %{
5456   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5457   match(Set dst (ReplicateF src));
5458   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
5459   ins_encode %{
5460     int vector_len = 2;
5461     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5462   %}
5463   ins_pipe( pipe_slow );
5464 %}
5465 
5466 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
5467   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5468   match(Set dst (ReplicateF (LoadF mem)));
5469   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
5470   ins_encode %{
5471     int vector_len = 2;
5472     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
5473   %}
5474   ins_pipe( pipe_slow );
5475 %}
5476 
5477 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
5478   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
5479   match(Set dst (ReplicateF zero));
5480   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
5481   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on DQ for vxorps: this is a 512-bit operation
5483     int vector_len = 2;
5484     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5485   %}
5486   ins_pipe( fpu_reg_reg );
5487 %}
5488 
5489 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
5490   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
5491   match(Set dst (ReplicateF zero));
5492   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
5493   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on DQ for vxorps: this is a 512-bit operation
5495     int vector_len = 2;
5496     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5497   %}
5498   ins_pipe( fpu_reg_reg );
5499 %}
5500 
5501 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
5502   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5503   match(Set dst (ReplicateF zero));
5504   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
5505   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on DQ for vxorps: this is a 512-bit operation
5507     int vector_len = 2;
5508     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5509   %}
5510   ins_pipe( fpu_reg_reg );
5511 %}
5512 
5513 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
5514   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
5515   match(Set dst (ReplicateF zero));
5516   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
5517   ins_encode %{
    // Use vpxor in place of vxorps since EVEX has a constraint on DQ for vxorps: this is a 512-bit operation
5519     int vector_len = 2;
5520     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5521   %}
5522   ins_pipe( fpu_reg_reg );
5523 %}
5524 
5525 instruct Repl4D_evex(vecY dst, regD src) %{
5526   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5527   match(Set dst (ReplicateD src));
5528   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
5529   ins_encode %{
5530     int vector_len = 1;
5531     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5532   %}
5533   ins_pipe( pipe_slow );
5534 %}
5535 
5536 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
5537   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
5538   match(Set dst (ReplicateD (LoadD mem)));
5539   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
5540   ins_encode %{
5541     int vector_len = 1;
5542     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
5543   %}
5544   ins_pipe( pipe_slow );
5545 %}
5546 
5547 instruct Repl8D_evex(vecZ dst, regD src) %{
5548   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5549   match(Set dst (ReplicateD src));
5550   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
5551   ins_encode %{
5552     int vector_len = 2;
5553     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
5554   %}
5555   ins_pipe( pipe_slow );
5556 %}
5557 
5558 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
5559   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5560   match(Set dst (ReplicateD (LoadD mem)));
5561   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
5562   ins_encode %{
5563     int vector_len = 2;
5564     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
5565   %}
5566   ins_pipe( pipe_slow );
5567 %}
5568 
5569 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
5570   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
5571   match(Set dst (ReplicateD zero));
5572   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
5573   ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on DQ for vxorpd: this is a 512-bit operation
5575     int vector_len = 2;
5576     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5577   %}
5578   ins_pipe( fpu_reg_reg );
5579 %}
5580 
5581 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
5582   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
5583   match(Set dst (ReplicateD zero));
5584   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
5585   ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on DQ for vxorpd: this is a 512-bit operation
5587     int vector_len = 2;
5588     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5589   %}
5590   ins_pipe( fpu_reg_reg );
5591 %}
5592 
5593 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
5594   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
5595   match(Set dst (ReplicateD zero));
  format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8D zero" %}
5597   ins_encode %{
    // Use vpxor in place of vxorpd since EVEX has a constraint on DQ for vxorpd: this is a 512-bit operation
5599     int vector_len = 2;
5600     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
5601   %}
5602   ins_pipe( fpu_reg_reg );
5603 %}
5604 
5605 // ====================VECTOR INSERT=======================================
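
// The insert rules below share one pattern: the element index is split into a
// position within a 128-bit lane plus one or two lane-selector bits, the lane
// is extracted, pinsr*/insertps patches the element, and the lane is written
// back.  A minimal sketch of the index math for byte elements in a 512-bit
// vector (mirroring rvinsert64B):
//
//   uint x_idx = idx & 15;        // byte position inside the 128-bit lane
//   uint y_idx = (idx >> 4) & 1;  // which 128-bit half of the 256-bit block
//   uint z_idx = (idx >> 5) & 1;  // which 256-bit half of the 512-bit vector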
5606 
5607 instruct rvinsert8B(vecD dst, vecD src, rRegI val, immU3 idx) %{
5608   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5609   match(Set dst (VectorInsert (Binary src val) idx));
5610   effect(TEMP dst);
5611   format %{ "movdqu  $dst,$src\n\t"
5612             "pinsrb  $dst,$val\t! Insert 8B" %}
5613   ins_encode %{
5614     if ($dst$$XMMRegister != $src$$XMMRegister) {
5615       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5616     }
5617     __ pinsrb($dst$$XMMRegister, $val$$Register, $idx$$constant);
5618   %}
5619   ins_pipe( pipe_slow );
5620 %}
5621 
5622 instruct rvinsert16B(vecX dst, vecX src, rRegI val, immU4 idx) %{
5623   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5624   match(Set dst (VectorInsert (Binary src val) idx));
5625   effect(TEMP dst);
5626   format %{ "movdqu  $dst,$src\n\t"
5627             "pinsrb  $dst,$val\t! Insert 16B" %}
5628   ins_encode %{
5629     if ($dst$$XMMRegister != $src$$XMMRegister) {
5630       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5631     }
5632     __ pinsrb($dst$$XMMRegister, $val$$Register, $idx$$constant);
5633   %}
5634   ins_pipe( pipe_slow );
5635 %}
5636 
5637 instruct rvinsert16B_avx(vecX dst, vecX src, rRegI val, immU4 idx) %{
5638   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5639   match(Set dst (VectorInsert (Binary src val) idx));
5640   effect(TEMP dst);
5641   format %{ "vmovdqu  $dst,$src\n\t"
5642             "vpinsrb  $dst,$dst,$val\t! Insert 16B" %}
5643   ins_encode %{
5644     if ($dst$$XMMRegister != $src$$XMMRegister) {
5645       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5646     }
5647     __ vpinsrb($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 instruct rvinsert32B(vecY dst, vecY src, vecY tmp, rRegI val, immU5 idx) %{
5653   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5654   match(Set dst (VectorInsert (Binary src val) idx));
5655   effect(TEMP dst, TEMP tmp);
  format %{ "vmovdqu  $dst,$src\n\t"
            "vextracti128  $tmp,$src\n\t"
            "vpinsrb  $tmp,$tmp,$val\n\t"
            "vinserti128  $dst,$dst,$tmp\t! Insert 32B" %}
5660   ins_encode %{
5661     uint x_idx = $idx$$constant & right_n_bits(4);
5662     uint y_idx = ($idx$$constant >> 4) & 1;
5663 
5664     if ($dst$$XMMRegister != $src$$XMMRegister) {
5665       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5666     }
5667     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5668     __ vpinsrb($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5669     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5670   %}
5671   ins_pipe( pipe_slow );
5672 %}
5673 
5674 instruct rvinsert64B(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU6 idx) %{
5675   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
5676   match(Set dst (VectorInsert (Binary src val) idx));
5677   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5678   format %{ "evmovdquq  $dst,$src\n\t"
5679             "vextracti64x4  $tmp,$src\n\t"
5680             "vextracti128  $tmp1,$tmp\n\t"
5681             "vpinsrb  $tmp1,$tmp1,$val\n\t"
5682             "vinserti128  $tmp,$tmp,$tmp1\n\t"
5683             "vinserti64x4  $dst,$dst,$tmp\t! Insert 64B" %}
5684   ins_encode %{
5685     uint x_idx = $idx$$constant & right_n_bits(4);
5686     uint y_idx = ($idx$$constant >> 4) & 1;
5687     uint z_idx = ($idx$$constant >> 5) & 1;
5688 
5689     if ($dst$$XMMRegister != $src$$XMMRegister) {
5690       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5691     }
5692     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5693     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5694     __ vpinsrb($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5695     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5696     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5697   %}
5698   ins_pipe( pipe_slow );
5699 %}
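
// Worked example for the rule above: idx = 37 gives x_idx = 5, y_idx = 0 and
// z_idx = 1, i.e. byte 5 of the lower 128-bit lane of the upper 256-bit half.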
5700 
5701 instruct rvinsert4S(vecD dst, vecD src, rRegI val, immU2 idx) %{
5702   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5703   match(Set dst (VectorInsert (Binary src val) idx));
5704   effect(TEMP dst);
5705   format %{ "movdqu  $dst,$src\n\t"
5706             "pinsrw  $dst,$val\t! Insert 4S" %}
5707   ins_encode %{
5708     if ($dst$$XMMRegister != $src$$XMMRegister) {
5709       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5710     }
5711     __ pinsrw($dst$$XMMRegister, $val$$Register, $idx$$constant);
5712   %}
5713   ins_pipe( pipe_slow );
5714 %}
5715 
5716 instruct rvinsert8S(vecX dst, vecX src, rRegI val, immU3 idx) %{
5717   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5718   match(Set dst (VectorInsert (Binary src val) idx));
5719   effect(TEMP dst);
5720   format %{ "movdqu  $dst,$src\n\t"
5721             "pinsrw  $dst,$val\t! Insert 8S" %}
5722   ins_encode %{
5723     if ($dst$$XMMRegister != $src$$XMMRegister) {
5724       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5725     }
5726     __ pinsrw($dst$$XMMRegister, $val$$Register, $idx$$constant);
5727   %}
5728   ins_pipe( pipe_slow );
5729 %}
5730 
5731 instruct rvinsert8S_avx(vecX dst, vecX src, rRegI val, immU3 idx) %{
5732   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5733   match(Set dst (VectorInsert (Binary src val) idx));
5734   effect(TEMP dst);
5735   format %{ "vmovdqu  $dst,$src\n\t"
5736             "vpinsrw  $dst,$dst,$val\t! Insert 8S" %}
5737   ins_encode %{
5738     if ($dst$$XMMRegister != $src$$XMMRegister) {
5739       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5740     }
5741     __ vpinsrw($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5742   %}
5743   ins_pipe( pipe_slow );
5744 %}
5745 
5747 instruct rvinsert16S(vecY dst, vecY src, vecX tmp, rRegI val, immU4 idx) %{
5748   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5749   match(Set dst (VectorInsert (Binary src val) idx));
5750   effect(TEMP dst, TEMP tmp);
5751   format %{ "vmovdqu  $dst,$src\n\t"
5752             "vextracti128  $tmp,$src\n\t"
5753             "vpinsrw  $tmp,$tmp,$val\n\t"
5754             "vinserti128  $dst,$dst,$tmp\t! Insert 16S" %}
5755   ins_encode %{
5756     uint x_idx = $idx$$constant & right_n_bits(3);
5757     uint y_idx = ($idx$$constant >> 3) & 1;
5758 
5759     if ($dst$$XMMRegister != $src$$XMMRegister) {
5760       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5761     }
5762     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5763     __ vpinsrw($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5764     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5765   %}
5766   ins_pipe( pipe_slow );
5767 %}
5768 
5769 instruct rvinsert32S(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU5 idx) %{
5770   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
5771   match(Set dst (VectorInsert (Binary src val) idx));
5772   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5773   format %{ "evmovdquq  $dst,$src\n\t"
5774             "vextracti64x4  $tmp,$src\n\t"
5775             "vextracti128  $tmp1,$tmp\n\t"
5776             "vpinsrw  $tmp1,$tmp1,$val\n\t"
5777             "vinserti128  $tmp,$tmp,$tmp1\n\t"
5778             "vinserti64x4  $dst,$dst,$tmp\t! Insert 32S" %}
5779   ins_encode %{
5780     uint x_idx = $idx$$constant & right_n_bits(3);
5781     uint y_idx = ($idx$$constant >> 3) & 1;
5782     uint z_idx = ($idx$$constant >> 4) & 1;
5783 
5784     if ($dst$$XMMRegister != $src$$XMMRegister) {
5785       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5786     }
5787     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5788     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5789     __ vpinsrw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5790     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5791     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5792   %}
5793   ins_pipe( pipe_slow );
5794 %}
5795 
5796 instruct rvinsert2I(vecD dst, vecD src, rRegI val, immU1 idx) %{
5797   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5798   match(Set dst (VectorInsert (Binary src val) idx));
5799   effect(TEMP dst);
5800   format %{ "movdqu  $dst,$src\n\t"
5801             "pinsrd  $dst,$val\t! Insert 2I" %}
5802   ins_encode %{
5803     if ($dst$$XMMRegister != $src$$XMMRegister) {
5804       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5805     }
5806     __ pinsrd($dst$$XMMRegister, $val$$Register, $idx$$constant);
5807   %}
5808   ins_pipe( pipe_slow );
5809 %}
5810 
5811 instruct rvinsert4I(vecX dst, vecX src, rRegI val, immU2 idx) %{
5812   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5813   match(Set dst (VectorInsert (Binary src val) idx));
5814   effect(TEMP dst);
5815   format %{ "movdqu  $dst,$src\n\t"
5816             "pinsrd  $dst,$val\t! Insert 4I" %}
5817   ins_encode %{
5818     if ($dst$$XMMRegister != $src$$XMMRegister) {
5819       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5820     }
5821     __ pinsrd($dst$$XMMRegister, $val$$Register, $idx$$constant);
5822   %}
5823   ins_pipe( pipe_slow );
5824 %}
5825 
5826 instruct rvinsert4I_avx(vecX dst, vecX src, rRegI val, immU2 idx) %{
5827   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5828   match(Set dst (VectorInsert (Binary src val) idx));
5829   effect(TEMP dst);
5830   format %{ "vmovdqu  $dst,$src\n\t"
            "vpinsrd  $dst,$dst,$val\t! Insert 4I" %}
5832   ins_encode %{
5833     if ($dst$$XMMRegister != $src$$XMMRegister) {
5834       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5835     }
5836     __ vpinsrd($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5837   %}
5838   ins_pipe( pipe_slow );
5839 %}
5840 
5841 instruct rvinsert8I(vecY dst, vecY src, vecY tmp, rRegI val, immU3 idx) %{
5842   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5843   match(Set dst (VectorInsert (Binary src val) idx));
5844   effect(TEMP dst, TEMP tmp);
5845   format %{ "vmovdqu  $dst,$src\n\t"
5846             "vextracti128  $tmp,$src\n\t"
5847             "vpinsrd  $tmp,$tmp,$val\n\t"
5848             "vinserti128  $dst,$dst,$tmp\t! Insert 8I" %}
5849   ins_encode %{
5850     uint x_idx = $idx$$constant & right_n_bits(2);
5851     uint y_idx = ($idx$$constant >> 2) & 1;
5852 
5853     if ($dst$$XMMRegister != $src$$XMMRegister) {
5854       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5855     }
5856     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5857     __ vpinsrd($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5858     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5859   %}
5860   ins_pipe( pipe_slow );
5861 %}
5862 
5863 instruct rvinsert16I(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegI val, immU4 idx) %{
5864   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
5865   match(Set dst (VectorInsert (Binary src val) idx));
5866   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5867   format %{ "evmovdquq  $dst,$src\n\t"
5868             "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrd  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
5872             "vinserti64x4  $dst,$dst,$tmp\t! Insert 16I" %}
5873   ins_encode %{
5874     uint x_idx = $idx$$constant & right_n_bits(2);
5875     uint y_idx = ($idx$$constant >> 2) & 1;
5876     uint z_idx = ($idx$$constant >> 3) & 1;
5877 
5878     if ($dst$$XMMRegister != $src$$XMMRegister) {
5879       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5880     }
5881     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5882     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5883     __ vpinsrd($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5884     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5885     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5886   %}
5887   ins_pipe( pipe_slow );
5888 %}
5889 
5890 instruct rvinsert1L(vecD dst, vecD src, rRegL val, immI0 idx) %{
5891   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5892   match(Set dst (VectorInsert (Binary src val) idx));
5893   effect(TEMP dst);
5894   format %{ "movdqu  $dst,$src\n\t"
5895             "pinsrq  $dst,$val\t! Insert 1L" %}
5896   ins_encode %{
5897     if ($dst$$XMMRegister != $src$$XMMRegister) {
5898       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5899     }
5900     __ pinsrq($dst$$XMMRegister, $val$$Register, 0);
5901   %}
5902   ins_pipe( pipe_slow );
5903 %}
5904 
5905 instruct rvinsert2L(vecX dst, vecX src, rRegL val, immU1 idx) %{
5906   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5907   match(Set dst (VectorInsert (Binary src val) idx));
5908   effect(TEMP dst);
5909   format %{ "movdqu  $dst,$src\n\t"
            "pinsrq  $dst,$val\t! Insert 2L" %}
5911   ins_encode %{
5912     if ($dst$$XMMRegister != $src$$XMMRegister) {
5913       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5914     }
5915     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
5916   %}
5917   ins_pipe( pipe_slow );
5918 %}
5919 
5920 instruct rvinsert2L_avx(vecX dst, vecX src, rRegL val, immU1 idx) %{
5921   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5922   match(Set dst (VectorInsert (Binary src val) idx));
5923   effect(TEMP dst);
5924   format %{ "vmovdqu  $dst,$src\n\t"
5925             "vpinsrq  $dst,$dst,$val\t! Insert 2L" %}
5926   ins_encode %{
5927     if ($dst$$XMMRegister != $src$$XMMRegister) {
5928       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5929     }
5930     __ vpinsrq($dst$$XMMRegister, $dst$$XMMRegister, $val$$Register, $idx$$constant);
5931   %}
5932   ins_pipe( pipe_slow );
5933 %}
5934 
5935 instruct rvinsert4L(vecY dst, vecY src, vecY tmp, rRegL val, immU2 idx) %{
5936   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5937   match(Set dst (VectorInsert (Binary src val) idx));
5938   effect(TEMP dst, TEMP tmp);
5939   format %{ "vmovdqu  $dst,$src\n\t"
5940             "vextracti128  $tmp,$src\n\t"
5941             "vpinsrq  $tmp,$tmp,$val\n\t"
5942             "vinserti128  $dst,$dst,$tmp\t! Insert 4L" %}
5943   ins_encode %{
5944     uint x_idx = $idx$$constant & 1;
5945     uint y_idx = ($idx$$constant >> 1) & 1;
5946 
5947     if ($dst$$XMMRegister != $src$$XMMRegister) {
5948       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
5949     }
5950     __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
5951     __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$Register, x_idx);
5952     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
5953   %}
5954   ins_pipe( pipe_slow );
5955 %}
5956 
5957 instruct rvinsert8L(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, rRegL val, immU3 idx) %{
5958   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
5959   match(Set dst (VectorInsert (Binary src val) idx));
5960   effect(TEMP dst, TEMP tmp, TEMP tmp1);
5961   format %{ "evmovdquq  $dst,$src\n\t"
5962             "vextracti64x4  $tmp,$src\n\t"
            "vextracti128  $tmp1,$tmp\n\t"
            "vpinsrq  $tmp1,$tmp1,$val\n\t"
            "vinserti128  $tmp,$tmp,$tmp1\n\t"
5966             "vinserti64x4  $dst,$dst,$tmp\t! Insert 8L" %}
5967   ins_encode %{
5968     uint x_idx = $idx$$constant & 1;
5969     uint y_idx = ($idx$$constant >> 1) & 1;
5970     uint z_idx = ($idx$$constant >> 2) & 1;
5971 
5972     if ($dst$$XMMRegister != $src$$XMMRegister) {
5973       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
5974     }
5975     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
5976     __ vextracti128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
5977     __ vpinsrq($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$Register, x_idx);
5978     __ vinserti128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
5979     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
5980   %}
5981   ins_pipe( pipe_slow );
5982 %}
5983 
5984 instruct rvinsert2F(vecD dst, vecD src, regF val, immU1 idx) %{
5985   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
5986   match(Set dst (VectorInsert (Binary src val) idx));
5987   effect(TEMP dst);
5988   format %{ "movdqu  $dst,$src\n\t"
            "insertps  $dst,$val\t! Insert 2F" %}
5990   ins_encode %{
5991     if ($dst$$XMMRegister != $src$$XMMRegister) {
5992       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
5993     }
5994     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
5995   %}
5996   ins_pipe( pipe_slow );
5997 %}
5998 
5999 instruct rvinsert2F_avx(vecD dst, vecD src, regF val, immU1 idx) %{
6000   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6001   match(Set dst (VectorInsert (Binary src val) idx));
6002   effect(TEMP dst);
6003   format %{ "movdqu  $dst,$src\n\t"
            "vinsertps  $dst,$dst,$val\t! Insert 2F" %}
6005   ins_encode %{
6006     if ($dst$$XMMRegister != $src$$XMMRegister) {
6007       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6008     }
6009     __ vinsertps($dst$$XMMRegister, $dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
6010   %}
6011   ins_pipe( pipe_slow );
6012 %}
6013 
6014 instruct rvinsert4F(vecX dst, vecX src, regF val, immU2 idx) %{
6015   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6016   match(Set dst (VectorInsert (Binary src val) idx));
6017   effect(TEMP dst);
6018   format %{ "movdqu  $dst,$src\n\t"
            "insertps  $dst,$val\t! Insert 4F" %}
6020   ins_encode %{
6021     if ($dst$$XMMRegister != $src$$XMMRegister) {
6022       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6023     }
6024     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
6025   %}
6026   ins_pipe( pipe_slow );
6027 %}
6028 
6029 instruct rvinsert4F_avx(vecX dst, vecX src, regF val, immU2 idx) %{
6030   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6031   match(Set dst (VectorInsert (Binary src val) idx));
6032   effect(TEMP dst);
6033   format %{ "vmovdqu  $dst,$src\n\t"
6034             "vinsertps  $dst,$dst,$val\t! Insert 4F" %}
6035   ins_encode %{
6036     if ($dst$$XMMRegister != $src$$XMMRegister) {
6037       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
6038     }
6039     __ vinsertps($dst$$XMMRegister, $dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
6040   %}
6041   ins_pipe( pipe_slow );
6042 %}
6043 
6044 instruct rvinsert8F(vecY dst, vecY src, vecY tmp, regF val, immU3 idx) %{
6045   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6046   match(Set dst (VectorInsert (Binary src val) idx));
6047   effect(TEMP dst, TEMP tmp);
6048   format %{ "vmovdqu  $dst,$src\n\t"
6049             "vextractf128  $tmp,$src\n\t"
6050             "vinsertps  $tmp,$tmp,$val\n\t"
6051             "vinsertf128  $dst,$dst,$tmp\t! Insert 8F" %}
6052   ins_encode %{
6053     uint x_idx = $idx$$constant & right_n_bits(2);
6054     uint y_idx = ($idx$$constant >> 2) & 1;
6055 
6056     if ($dst$$XMMRegister != $src$$XMMRegister) {
6057       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
6058     }
6059     __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
6060     __ vinsertps($tmp$$XMMRegister, $tmp$$XMMRegister, $val$$XMMRegister, x_idx);
6061     __ vinsertf128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
6062   %}
6063   ins_pipe( pipe_slow );
6064 %}
6065 
6066 instruct rvinsert16F(vecZ dst, vecZ src, vecZ tmp, vecX tmp1, regF val, immU4 idx) %{
6067   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
6068   match(Set dst (VectorInsert (Binary src val) idx));
6069   effect(TEMP dst, TEMP tmp, TEMP tmp1);
6070   format %{ "evmovdquq  $dst,$src\n\t"
            "vextractf64x4  $tmp,$src\n\t"
            "vextractf128  $tmp1,$tmp\n\t"
            "vinsertps  $tmp1,$tmp1,$val\n\t"
            "vinsertf128  $tmp,$tmp,$tmp1\n\t"
            "vinsertf64x4  $dst,$dst,$tmp\t! Insert 16F" %}
6074   ins_encode %{
6075     uint x_idx = $idx$$constant & right_n_bits(2);
6076     uint y_idx = ($idx$$constant >> 2) & 1;
6077     uint z_idx = ($idx$$constant >> 3) & 1;
6078 
6079     if ($dst$$XMMRegister != $src$$XMMRegister) {
6080       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
6081     }
6082     __ vextractf64x4($tmp$$XMMRegister, $src$$XMMRegister, z_idx);
6083     __ vextractf128($tmp1$$XMMRegister, $tmp$$XMMRegister, y_idx);
6084     __ vinsertps($tmp1$$XMMRegister, $tmp1$$XMMRegister, $val$$XMMRegister, x_idx);
6085     __ vinsertf128($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, y_idx);
6086     __ vinsertf64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, z_idx);
6087   %}
6088   ins_pipe( pipe_slow );
6089 %}
6090 
6091 instruct rvinsert1D(vecD dst, vecD src, regD val, rRegL tmp, immI0 idx) %{
6092   predicate(UseSSE > 3 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6093   match(Set dst (VectorInsert (Binary src val) idx));
6094   effect(TEMP dst, TEMP tmp);
6095   format %{ "movdqu  $dst,$src\n\t"
6096             "movq $tmp,$val\n\t"
6097             "pinsrq  $dst,$tmp\t! Insert 1D" %}
6098   ins_encode %{
6099     if ($dst$$XMMRegister != $src$$XMMRegister) {
6100       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6101     }
6102     __ movq($tmp$$Register, $val$$XMMRegister);
6103     __ pinsrq($dst$$XMMRegister, $tmp$$Register, 0);
6104   %}
6105   ins_pipe( pipe_slow );
6106 %}
6107 
6108 instruct rvinsert2D(vecX dst, vecX src, regD val, rRegL tmp, immU1 idx) %{
6109   predicate(UseSSE > 3 && UseAVX == 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6110   match(Set dst (VectorInsert (Binary src val) idx));
6111   effect(TEMP dst, TEMP tmp);
6112   format %{ "movdqu  $dst,$src\n\t"
            "movq  $tmp,$val\n\t"
            "pinsrq  $dst,$tmp\t! Insert 2D" %}
6115   ins_encode %{
6116     if ($dst$$XMMRegister != $src$$XMMRegister) {
6117       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6118     }
6119     __ movq($tmp$$Register, $val$$XMMRegister);
6120     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
6121   %}
6122   ins_pipe( pipe_slow );
6123 %}
6124 
6125 instruct rvinsert2D_avx(vecX dst, vecX src, regD val, rRegL tmp, immU1 idx) %{
6126   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6127   match(Set dst (VectorInsert (Binary src val) idx));
6128   effect(TEMP dst, TEMP tmp);
6129   format %{ "vmovdqu  $dst,$src\n\t"
6130             "movq  $tmp,$val\n\t"
6131             "vpinsrq  $dst,$dst,$tmp\t! Insert 2D" %}
6132   ins_encode %{
6133     if ($dst$$XMMRegister != $src$$XMMRegister) {
6134       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
6135     }
6136     __ movq($tmp$$Register, $val$$XMMRegister);
6137     __ vpinsrq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$Register, $idx$$constant);
6138   %}
6139   ins_pipe( pipe_slow );
6140 %}
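
// Doubles have no direct xmm-to-xmm pinsr form, so the rules above bounce the
// value through a general register: movq copies the raw bits of $val into a
// 64-bit GPR, and pinsrq/vpinsrq then writes them into the selected lane.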
6141 
6142 instruct rvinsert4D(vecY dst, vecY src, vecY tmp, regD val, rRegL tmp1, immU2 idx) %{
6143   predicate(UseAVX > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6144   match(Set dst (VectorInsert (Binary src val) idx));
6145   effect(TEMP dst, TEMP tmp, TEMP tmp1);
6146   format %{ "vmovdqu  $dst,$src\n\t"
            "vextractf128  $tmp,$src\n\t"
            "movq $tmp1,$val\n\t"
            "vpinsrq  $tmp,$tmp,$tmp1\n\t"
            "vinsertf128  $dst,$dst,$tmp\t! Insert 4D" %}
6151   ins_encode %{
6152     uint x_idx = $idx$$constant & 1;
6153     uint y_idx = ($idx$$constant >> 1) & 1;
6154 
6155     if ($dst$$XMMRegister != $src$$XMMRegister) {
6156       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
6157     }
6158     __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, y_idx);
6159     __ movq($tmp1$$Register, $val$$XMMRegister);
6160     __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$Register, x_idx);
6161     __ vinsertf128($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, y_idx);
6162   %}
6163   ins_pipe( pipe_slow );
6164 %}
6165 
6166 instruct rvinsert8D(vecZ dst, vecZ src, vecZ tmp, vecY tmp2, regD val, rRegL tmp1, immU3 idx) %{
6167   predicate(UseAVX > 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
6168   match(Set dst (VectorInsert (Binary src val) idx));
6169   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2);
6170   format %{ "evmovdquq  $dst,$src\n\t"
            "vextractf64x4  $tmp2,$src\n\t"
            "vextractf128  $tmp,$tmp2\n\t"
            "movq $tmp1,$val\n\t"
            "vpinsrq  $tmp,$tmp,$tmp1\n\t"
            "vinsertf128  $tmp2,$tmp2,$tmp\n\t"
            "vinsertf64x4  $dst,$dst,$tmp2\t! Insert 8D" %}
6177   ins_encode %{
6178     uint x_idx = $idx$$constant & 1;
6179     uint y_idx = ($idx$$constant >> 1) & 1;
6180     uint z_idx = ($idx$$constant >> 2) & 1;
6181 
6182     if ($dst$$XMMRegister != $src$$XMMRegister) {
6183       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, 2);
6184     }
6185     __ vextractf64x4($tmp2$$XMMRegister, $src$$XMMRegister, z_idx);
6186     __ vextractf128($tmp$$XMMRegister, $tmp2$$XMMRegister, y_idx);
6187     __ movq($tmp1$$Register, $val$$XMMRegister);
6188     __ vpinsrq($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$Register, x_idx);
6189     __ vinsertf128($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, y_idx);
6190     __ vinsertf64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, z_idx);
6191   %}
6192   ins_pipe( pipe_slow );
6193 %}
6194 
6195 // ====================REDUCTION ARITHMETIC=======================================
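
// Each reduction below folds the upper half of the vector into the lower half
// until a few elements remain, extracts them, and combines the total with the
// scalar input.  The net effect of the byte reductions, written as plain C
// for reference (a semantic sketch, not the emitted code):
//
//   int reduce_add_8bit(int acc, const int8_t* v, int n) {
//     int s = (uint8_t)acc;              // movzbl of the scalar input
//     for (int i = 0; i < n; i++)
//       s += v[i];                       // lane-wise byte adds
//     return (int8_t)s;                  // final movsbl re-sign-extends
//   }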
6196 
6197 instruct rsadd8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
6198   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6199   match(Set dst (AddReductionVI src1 src2));
6200   effect(TEMP tmp, TEMP tmp2, TEMP dst);
6201   format %{
6202             "pshufd  $tmp,$src2,0x1\n\t"
6203             "paddb   $tmp,$src2\n\t"
6204             "movzbl  $dst,$src1\n\t"
6205             "pextrb  $tmp2,$tmp, 0x0\n\t"
6206             "addl    $dst,$tmp2\n\t"
6207             "pextrb  $tmp2,$tmp, 0x1\n\t"
6208             "addl    $dst,$tmp2\n\t"
6209             "pextrb  $tmp2,$tmp, 0x2\n\t"
6210             "addl    $dst,$tmp2\n\t"
6211             "pextrb  $tmp2,$tmp, 0x3\n\t"
6212             "addl    $dst,$tmp2\n\t"
6213             "movsbl  $dst,$dst\t! add reduction8B" %}
6214   ins_encode %{
6215     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
6216     __ paddb($tmp$$XMMRegister, $src2$$XMMRegister);
6217     __ movzbl($dst$$Register, $src1$$Register);
6218     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6219     __ addl($dst$$Register, $tmp2$$Register);
6220     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
6221     __ addl($dst$$Register, $tmp2$$Register);
6222     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
6223     __ addl($dst$$Register, $tmp2$$Register);
6224     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
6225     __ addl($dst$$Register, $tmp2$$Register);
6226     __ movsbl($dst$$Register, $dst$$Register);
6227   %}
6228   ins_pipe( pipe_slow );
6229 %}
6230 
6231 instruct rsadd16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
6232   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6233   match(Set dst (AddReductionVI src1 src2));
6234   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
6235   format %{ "pshufd  $tmp,$src2,0xE\n\t"
6236             "paddb   $tmp,$src2\n\t"
6237             "pshufd  $tmp2,$tmp,0x1\n\t"
            "paddb   $tmp,$tmp2\n\t"
6239             "movzbl  $dst,$src1\n\t"
6240             "pextrb  $tmp3,$tmp, 0x0\n\t"
6241             "addl    $dst,$tmp3\n\t"
6242             "pextrb  $tmp3,$tmp, 0x1\n\t"
6243             "addl    $dst,$tmp3\n\t"
6244             "pextrb  $tmp3,$tmp, 0x2\n\t"
6245             "addl    $dst,$tmp3\n\t"
6246             "pextrb  $tmp3,$tmp, 0x3\n\t"
6247             "addl    $dst,$tmp3\n\t"
6248             "movsbl  $dst,$dst\t! add reduction16B" %}
6249   ins_encode %{
6250     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6251     __ paddb($tmp$$XMMRegister, $src2$$XMMRegister);
6252     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6253     __ paddb($tmp$$XMMRegister, $tmp2$$XMMRegister);
6254     __ movzbl($dst$$Register, $src1$$Register);
6255     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
6256     __ addl($dst$$Register, $tmp3$$Register);
6257     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
6258     __ addl($dst$$Register, $tmp3$$Register);
6259     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
6260     __ addl($dst$$Register, $tmp3$$Register);
6261     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
6262     __ addl($dst$$Register, $tmp3$$Register);
6263     __ movsbl($dst$$Register, $dst$$Register);
6264   %}
6265   ins_pipe( pipe_slow );
6266 %}
6267 
6268 instruct rvadd32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
6269   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6270   match(Set dst (AddReductionVI src1 src2));
6271   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
6272   format %{ "vextracti128_high  $tmp,$src2\n\t"
6273             "vpaddb  $tmp,$tmp,$src2\n\t"
6274             "pshufd  $tmp2,$tmp,0xE\n\t"
6275             "vpaddb  $tmp,$tmp,$tmp2\n\t"
6276             "pshufd  $tmp2,$tmp,0x1\n\t"
6277             "vpaddb  $tmp,$tmp,$tmp2\n\t"
6278             "movzbl  $dst,$src1\n\t"
6279             "pextrb  $tmp3,$tmp, 0x0\n\t"
6280             "addl    $dst,$tmp3\n\t"
6281             "pextrb  $tmp3,$tmp, 0x1\n\t"
6282             "addl    $dst,$tmp3\n\t"
6283             "pextrb  $tmp3,$tmp, 0x2\n\t"
6284             "addl    $dst,$tmp3\n\t"
6285             "pextrb  $tmp3,$tmp, 0x3\n\t"
6286             "addl    $dst,$tmp3\n\t"
6287             "movsbl  $dst,$dst\t! add reduction32B" %}
6288   ins_encode %{
6289     int vector_len = 0;
6290     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
6291     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
6292     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6293     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6294     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6295     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6296     __ movzbl($dst$$Register, $src1$$Register);
6297     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
6298     __ addl($dst$$Register, $tmp3$$Register);
6299     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
6300     __ addl($dst$$Register, $tmp3$$Register);
6301     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
6302     __ addl($dst$$Register, $tmp3$$Register);
6303     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
6304     __ addl($dst$$Register, $tmp3$$Register);
6305     __ movsbl($dst$$Register, $dst$$Register);
6306   %}
6307   ins_pipe( pipe_slow );
6308 %}
6309 
6310 instruct rvadd64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
6311   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
6312   match(Set dst (AddReductionVI src1 src2));
6313   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
6314   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
6315             "vpaddb  $tmp2,$tmp2,$src2\n\t"
6316             "vextracti128_high  $tmp,$tmp2\n\t"
6317             "vpaddb  $tmp,$tmp,$tmp2\n\t"
6318             "pshufd  $tmp2,$tmp,0xE\n\t"
6319             "vpaddb  $tmp,$tmp,$tmp2\n\t"
6320             "pshufd  $tmp2,$tmp,0x1\n\t"
6321             "vpaddb  $tmp,$tmp,$tmp2\n\t"
6322             "movzbl  $dst,$src1\n\t"
6323             "movdl   $tmp3,$tmp\n\t"
6324             "addl    $dst,$tmp3\n\t"
6325             "shrl    $tmp3,0x8\n\t"
6326             "addl    $dst,$tmp3\n\t"
6327             "shrl    $tmp3,0x8\n\t"
6328             "addl    $dst,$tmp3\n\t"
6329             "shrl    $tmp3,0x8\n\t"
6330             "addl    $dst,$tmp3\n\t"
6331             "movsbl  $dst,$dst\t! add reduction64B" %}
6332   ins_encode %{
6333     int vector_len = 0;
6334     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6335     __ vpaddb($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
6336     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
6337     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6338     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6339     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6340     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6341     __ vpaddb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6342     __ movzbl($dst$$Register, $src1$$Register);
6343     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
6344     __ addl($dst$$Register, $tmp3$$Register);
6345     __ shrl($tmp3$$Register, 8);
6346     __ addl($dst$$Register, $tmp3$$Register);
6347     __ shrl($tmp3$$Register, 8);
6348     __ addl($dst$$Register, $tmp3$$Register);
6349     __ shrl($tmp3$$Register, 8);
6350     __ addl($dst$$Register, $tmp3$$Register);
6351     __ movsbl($dst$$Register, $dst$$Register);
6352   %}
6353   ins_pipe( pipe_slow );
6354 %}
6355 
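// Short (16-bit) add reductions.  phaddw adds adjacent word pairs, so each
// round halves the number of live lanes: 4 shorts need two rounds, 8 need
// three.  E.g. for 4S with words [a0,a1,a2,a3]:
//   phaddw tmp,tmp   ; w0 = a0+a1, w1 = a2+a3
//   phaddw tmp,tmp   ; w0 = (a0+a1)+(a2+a3)
// The scalar input is then added in with addw and the result sign-extended
// back to an int with movswl.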
6356 instruct rsadd4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
6357   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6358   match(Set dst (AddReductionVI src1 src2));
6359   effect(TEMP tmp, TEMP tmp2, TEMP dst);
6360   format %{
6361             "movdqu   $tmp,$src2\n\t"
6362             "phaddw   $tmp,$tmp\n\t"
6363             "phaddw   $tmp,$tmp\n\t"
6364             "movzwl   $dst,$src1\n\t"
6365             "pextrw   $tmp2,$tmp, 0x0\n\t"
6366             "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction4S" %}
6368   ins_encode %{
6369     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
6370     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
6371     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
6372     __ movzwl($dst$$Register, $src1$$Register);
6373     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6374     __ addw($dst$$Register, $tmp2$$Register);
6375     __ movswl($dst$$Register, $dst$$Register);
6376   %}
6377   ins_pipe( pipe_slow );
6378 %}
6379 
6380 instruct rvadd4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
6381   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6382   match(Set dst (AddReductionVI src1 src2));
6383   effect(TEMP tmp, TEMP tmp2, TEMP dst);
6384   format %{ "vphaddw  $tmp,$src2,$src2\n\t"
6385             "vphaddw  $tmp,$tmp,$tmp\n\t"
6386             "movzwl   $dst,$src1\n\t"
6387             "pextrw   $tmp2,$tmp, 0x0\n\t"
6388             "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction4S" %}
6390   ins_encode %{
6391     int vector_len = 0;
6392     __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6393     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6394     __ movzwl($dst$$Register, $src1$$Register);
6395     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6396     __ addw($dst$$Register, $tmp2$$Register);
6397     __ movswl($dst$$Register, $dst$$Register);
6398   %}
6399   ins_pipe( pipe_slow );
6400 %}
6401 
6402 instruct rsadd8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2) %{
6403   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6404   match(Set dst (AddReductionVI src1 src2));
6405   effect(TEMP tmp, TEMP tmp2, TEMP dst);
6406   format %{
            "movdqu   $tmp,$src2\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "phaddw   $tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction8S" %}
6415   ins_encode %{
6416     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
6417     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
6418     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
6419     __ phaddw($tmp$$XMMRegister, $tmp$$XMMRegister);
6420     __ movzwl($dst$$Register, $src1$$Register);
6421     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6422     __ addw($dst$$Register, $tmp2$$Register);
6423     __ movswl($dst$$Register, $dst$$Register);
6424   %}
6425   ins_pipe( pipe_slow );
6426 %}
6427 
6428 instruct rvadd8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2) %{
6429   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6430   match(Set dst (AddReductionVI src1 src2));
6431   effect(TEMP tmp, TEMP tmp2, TEMP dst);
6432   format %{ "vphaddw  $tmp,$src2,$src2\n\t"
6433             "vphaddw  $tmp,$tmp,$tmp\n\t"
6434             "vphaddw  $tmp,$tmp,$tmp\n\t"
6435             "movzwl   $dst,$src1\n\t"
6436             "pextrw   $tmp2,$tmp, 0x0\n\t"
6437             "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction8S" %}
6439   ins_encode %{
6440     int vector_len = 0;
6441     __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6442     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6443     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6444     __ movzwl($dst$$Register, $src1$$Register);
6445     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6446     __ addw($dst$$Register, $tmp2$$Register);
6447     __ movswl($dst$$Register, $dst$$Register);
6448   %}
6449   ins_pipe( pipe_slow );
6450 %}
6451 
6452 instruct rvadd16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2) %{
6453   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6454   match(Set dst (AddReductionVI src1 src2));
6455   effect(TEMP tmp, TEMP tmp2, TEMP dst);
  format %{ "vphaddw  $tmp,$src2,$src2\n\t"
            "vpermq   $tmp,$tmp,0xD8\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "vphaddw  $tmp,$tmp,$tmp\n\t"
            "movzwl   $dst,$src1\n\t"
            "pextrw   $tmp2,$tmp, 0x0\n\t"
            "addw     $dst,$tmp2\n\t"
            "movswl   $dst,$dst\t! add reduction16S" %}
6464   ins_encode %{
6465     int vector_len = 1;
6466     __ vphaddw($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6467     __ vpermq($tmp$$XMMRegister, $tmp$$XMMRegister, 0xD8, vector_len);
6468     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6469     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6470     __ vphaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6471     __ movzwl($dst$$Register, $src1$$Register);
6472     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
6473     __ addw($dst$$Register, $tmp2$$Register);
6474     __ movswl($dst$$Register, $dst$$Register);
6475   %}
6476   ins_pipe( pipe_slow );
6477 %}
6478 
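// 512-bit short add reduction: fold halves with vpaddw (512->256->128),
// shuffle-add down to two word sums sharing the low dword, then pull both out
// with a single movdl and split them with a 16-bit shift before the final
// addw/movswl.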
6479 instruct rvadd32S_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
6480   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
6481   match(Set dst (AddReductionVI src1 src2));
6482   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
6483   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
6484             "vpaddw  $tmp2,$tmp2,$src2\n\t"
6485             "vextracti128_high  $tmp,$tmp2\n\t"
6486             "vpaddw  $tmp,$tmp,$tmp2\n\t"
6487             "pshufd  $tmp2,$tmp,0xE\n\t"
6488             "vpaddw  $tmp,$tmp,$tmp2\n\t"
6489             "pshufd  $tmp2,$tmp,0x1\n\t"
6490             "vpaddw  $tmp,$tmp,$tmp2\n\t"
            "movdl   $tmp3,$tmp\n\t"
            "movzwl  $dst,$src1\n\t"
            "addw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
            "addw    $dst,$tmp3\n\t"
            "movswl  $dst,$dst\t! add reduction32S" %}
6496   ins_encode %{
6497     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6498     __ vpaddw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
6499     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
6500     __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6501     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6502     __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6503     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6504     __ vpaddw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6505     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
6506     __ movzwl($dst$$Register, $src1$$Register);
6507     __ addw($dst$$Register, $tmp3$$Register);
6508     __ shrl($tmp3$$Register, 16);
6509     __ addw($dst$$Register, $tmp3$$Register);
6510     __ movswl($dst$$Register, $dst$$Register);
6511   %}
6512   ins_pipe( pipe_slow );
6513 %}
6514 
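// Int add reductions.  The pre-EVEX rules use horizontal adds (phaddd /
// vphaddd); the EVEX rules use a shuffle-add tree instead.  E.g. for four
// ints [s0,s1,s2,s3] in an XMM register:
//   pshufd tmp2,src,0xE   ; lanes [s2,s3,..]
//   vpaddd tmp,src,tmp2   ; lane0 = s0+s2, lane1 = s1+s3
//   pshufd tmp2,tmp,0x1   ; lane0 = s1+s3
//   vpaddd tmp,tmp,tmp2   ; lane0 = s0+s1+s2+s3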
6515 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
6516   predicate(UseSSE > 2 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6517   match(Set dst (AddReductionVI src1 src2));
6518   effect(TEMP tmp2, TEMP tmp);
6519   format %{ "movdqu  $tmp2,$src2\n\t"
6520             "phaddd  $tmp2,$tmp2\n\t"
6521             "movd    $tmp,$src1\n\t"
6522             "paddd   $tmp,$tmp2\n\t"
6523             "movd    $dst,$tmp\t! add reduction2I" %}
6524   ins_encode %{
6525     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
6526     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
6527     __ movdl($tmp$$XMMRegister, $src1$$Register);
6528     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
6529     __ movdl($dst$$Register, $tmp$$XMMRegister);
6530   %}
6531   ins_pipe( pipe_slow );
6532 %}
6533 
6534 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
6535   predicate(VM_Version::supports_avxonly()  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6536   match(Set dst (AddReductionVI src1 src2));
6537   effect(TEMP tmp, TEMP tmp2);
6538   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
6539             "movd     $tmp2,$src1\n\t"
6540             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
6541             "movd     $dst,$tmp2\t! add reduction2I" %}
6542   ins_encode %{
6543     int vector_len = 0;
6544     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6545     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6546     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
6547     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6548   %}
6549   ins_pipe( pipe_slow );
6550 %}
6551 
6552 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
6553   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6554   match(Set dst (AddReductionVI src1 src2));
6555   effect(TEMP tmp, TEMP tmp2);
6556   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
6557             "vpaddd  $tmp,$src2,$tmp2\n\t"
6558             "movd    $tmp2,$src1\n\t"
6559             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
6560             "movd    $dst,$tmp2\t! add reduction2I" %}
6561   ins_encode %{
6562     int vector_len = 0;
6563     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6564     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6565     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6566     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6567     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6568   %}
6569   ins_pipe( pipe_slow );
6570 %}
6571 
6572 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
6573   predicate(UseSSE > 2 && UseAVX == 0  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6574   match(Set dst (AddReductionVI src1 src2));
6575   effect(TEMP tmp, TEMP tmp2);
6576   format %{ "movdqu  $tmp,$src2\n\t"
6577             "phaddd  $tmp,$tmp\n\t"
6578             "phaddd  $tmp,$tmp\n\t"
6579             "movd    $tmp2,$src1\n\t"
6580             "paddd   $tmp2,$tmp\n\t"
6581             "movd    $dst,$tmp2\t! add reduction4I" %}
6582   ins_encode %{
6583     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
6584     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
6585     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
6586     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6587     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
6588     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6589   %}
6590   ins_pipe( pipe_slow );
6591 %}
6592 
6593 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
6594   predicate(VM_Version::supports_avxonly() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6595   match(Set dst (AddReductionVI src1 src2));
6596   effect(TEMP tmp, TEMP tmp2);
6597   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
6598             "vphaddd  $tmp,$tmp,$tmp\n\t"
6599             "movd     $tmp2,$src1\n\t"
6600             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
6601             "movd     $dst,$tmp2\t! add reduction4I" %}
6602   ins_encode %{
6603     int vector_len = 0;
6604     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6605     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
6606     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6607     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
6608     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6609   %}
6610   ins_pipe( pipe_slow );
6611 %}
6612 
6613 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
6614   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6615   match(Set dst (AddReductionVI src1 src2));
6616   effect(TEMP tmp, TEMP tmp2);
6617   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
6618             "vpaddd  $tmp,$src2,$tmp2\n\t"
6619             "pshufd  $tmp2,$tmp,0x1\n\t"
6620             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6621             "movd    $tmp2,$src1\n\t"
6622             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
6623             "movd    $dst,$tmp2\t! add reduction4I" %}
6624   ins_encode %{
6625     int vector_len = 0;
6626     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
6627     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6628     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6629     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6630     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6631     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6632     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6633   %}
6634   ins_pipe( pipe_slow );
6635 %}
6636 
6637 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
6638   predicate(VM_Version::supports_avxonly()  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6639   match(Set dst (AddReductionVI src1 src2));
6640   effect(TEMP tmp, TEMP tmp2);
6641   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
6642             "vphaddd  $tmp,$tmp,$tmp2\n\t"
6643             "vextracti128_high  $tmp2,$tmp\n\t"
6644             "vpaddd   $tmp,$tmp,$tmp2\n\t"
6645             "movd     $tmp2,$src1\n\t"
6646             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
6647             "movd     $dst,$tmp2\t! add reduction8I" %}
6648   ins_encode %{
6649     int vector_len = 1;
6650     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
6651     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6652     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
6653     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6654     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6655     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6656     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6657   %}
6658   ins_pipe( pipe_slow );
6659 %}
6660 
6661 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
6662   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6663   match(Set dst (AddReductionVI src1 src2));
6664   effect(TEMP tmp, TEMP tmp2);
6665   format %{ "vextracti128_high  $tmp,$src2\n\t"
6666             "vpaddd  $tmp,$tmp,$src2\n\t"
6667             "pshufd  $tmp2,$tmp,0xE\n\t"
6668             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6669             "pshufd  $tmp2,$tmp,0x1\n\t"
6670             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6671             "movd    $tmp2,$src1\n\t"
6672             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
6673             "movd    $dst,$tmp2\t! add reduction8I" %}
6674   ins_encode %{
6675     int vector_len = 0;
6676     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
6677     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
6678     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6679     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6680     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6681     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6682     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6683     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
6684     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6685   %}
6686   ins_pipe( pipe_slow );
6687 %}
6688 
6689 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
6690   predicate(UseAVX > 2  && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
6691   match(Set dst (AddReductionVI src1 src2));
6692   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
6693   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
6694             "vpaddd  $tmp3,$tmp3,$src2\n\t"
6695             "vextracti128_high  $tmp,$tmp3\n\t"
6696             "vpaddd  $tmp,$tmp,$tmp3\n\t"
6697             "pshufd  $tmp2,$tmp,0xE\n\t"
6698             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6699             "pshufd  $tmp2,$tmp,0x1\n\t"
6700             "vpaddd  $tmp,$tmp,$tmp2\n\t"
6701             "movd    $tmp2,$src1\n\t"
6702             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
            "movd    $dst,$tmp2\t! add reduction16I" %}
6704   ins_encode %{
6705     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
6706     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
6707     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
6708     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
6709     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
6710     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6711     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
6712     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6713     __ movdl($tmp2$$XMMRegister, $src1$$Register);
6714     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
6715     __ movdl($dst$$Register, $tmp2$$XMMRegister);
6716   %}
6717   ins_pipe( pipe_slow );
6718 %}
6719 
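// Long add reductions keep the scalar operand and result in a 64-bit GPR
// (rRegL), so they exist only on LP64 builds.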
6720 #ifdef _LP64
6721 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
6722   match(Set dst (AddReductionVL src1 src2));
6723   effect(TEMP tmp, TEMP tmp2);
6724   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
6725             "paddq   $tmp2,$src2\n\t"
6726             "movdq   $tmp,$src1\n\t"
6727             "paddq   $tmp2,$tmp\n\t"
6728             "movdq   $dst,$tmp2\t! add reduction2L" %}
6729   ins_encode %{
6730     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
6731     __ paddq($tmp2$$XMMRegister, $src2$$XMMRegister);
6732     __ movdq($tmp$$XMMRegister, $src1$$Register);
6733     __ paddq($tmp2$$XMMRegister, $tmp$$XMMRegister);
6734     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6735   %}
6736   ins_pipe( pipe_slow );
6737 %}
6738 
6739 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
6740   predicate(UseAVX > 1);
6741   match(Set dst (AddReductionVL src1 src2));
6742   effect(TEMP tmp, TEMP tmp2);
6743   format %{ "vextracti128_high  $tmp,$src2\n\t"
6744             "vpaddq  $tmp2,$tmp,$src2\n\t"
6745             "pshufd  $tmp,$tmp2,0xE\n\t"
6746             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6747             "movdq   $tmp,$src1\n\t"
6748             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6749             "movdq   $dst,$tmp2\t! add reduction4L" %}
6750   ins_encode %{
6751     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
6752     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
6753     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6754     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6755     __ movdq($tmp$$XMMRegister, $src1$$Register);
6756     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6757     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6758   %}
6759   ins_pipe( pipe_slow );
6760 %}
6761 
6762 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
6763   predicate(UseAVX > 2);
6764   match(Set dst (AddReductionVL src1 src2));
6765   effect(TEMP tmp, TEMP tmp2);
6766   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
6767             "vpaddq  $tmp2,$tmp2,$src2\n\t"
6768             "vextracti128_high  $tmp,$tmp2\n\t"
6769             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6770             "pshufd  $tmp,$tmp2,0xE\n\t"
6771             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6772             "movdq   $tmp,$src1\n\t"
6773             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
6774             "movdq   $dst,$tmp2\t! add reduction8L" %}
6775   ins_encode %{
6776     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6777     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
6778     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
6779     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6780     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
6781     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6782     __ movdq($tmp$$XMMRegister, $src1$$Register);
6783     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
6784     __ movdq($dst$$Register, $tmp2$$XMMRegister);
6785   %}
6786   ins_pipe( pipe_slow );
6787 %}
6788 #endif
6789 
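// Float add reductions.  FP addition is not associative, so no pairwise tree
// is used: the accumulator in $dst is combined with each lane in order, using
// pshufd (plus 128-bit extracts for the wider vectors) to bring lane i down
// to element 0 before each scalar addss/vaddss.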
6790 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
6791   predicate(UseSSE >= 1 && UseAVX == 0);
6792   match(Set dst (AddReductionVF dst src2));
6793   effect(TEMP dst, TEMP tmp);
6794   format %{ "addss   $dst,$src2\n\t"
6795             "pshufd  $tmp,$src2,0x01\n\t"
6796             "addss   $dst,$tmp\t! add reduction2F" %}
6797   ins_encode %{
6798     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
6799     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6800     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6801   %}
6802   ins_pipe( pipe_slow );
6803 %}
6804 
6805 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
6806   predicate(UseAVX > 0);
6807   match(Set dst (AddReductionVF dst src2));
6808   effect(TEMP dst, TEMP tmp);
6809   format %{ "vaddss  $dst,$dst,$src2\n\t"
6810             "pshufd  $tmp,$src2,0x01\n\t"
6811             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
6812   ins_encode %{
6813     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6814     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6815     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6816   %}
6817   ins_pipe( pipe_slow );
6818 %}
6819 
6820 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
6821   predicate(UseSSE >= 1 && UseAVX == 0);
6822   match(Set dst (AddReductionVF dst src2));
6823   effect(TEMP dst, TEMP tmp);
6824   format %{ "addss   $dst,$src2\n\t"
6825             "pshufd  $tmp,$src2,0x01\n\t"
6826             "addss   $dst,$tmp\n\t"
6827             "pshufd  $tmp,$src2,0x02\n\t"
6828             "addss   $dst,$tmp\n\t"
6829             "pshufd  $tmp,$src2,0x03\n\t"
6830             "addss   $dst,$tmp\t! add reduction4F" %}
6831   ins_encode %{
6832     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
6833     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6834     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6835     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6836     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6837     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6838     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
6839   %}
6840   ins_pipe( pipe_slow );
6841 %}
6842 
6843 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
6844   predicate(UseAVX > 0);
6845   match(Set dst (AddReductionVF dst src2));
6846   effect(TEMP tmp, TEMP dst);
  format %{ "vaddss  $dst,$dst,$src2\n\t"
6848             "pshufd  $tmp,$src2,0x01\n\t"
6849             "vaddss  $dst,$dst,$tmp\n\t"
6850             "pshufd  $tmp,$src2,0x02\n\t"
6851             "vaddss  $dst,$dst,$tmp\n\t"
6852             "pshufd  $tmp,$src2,0x03\n\t"
6853             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
6854   ins_encode %{
6855     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6856     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6857     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6858     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6859     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6860     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6861     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6862   %}
6863   ins_pipe( pipe_slow );
6864 %}
6865 
6866 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
6867   predicate(UseAVX > 0);
6868   match(Set dst (AddReductionVF dst src2));
6869   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6870   format %{ "vaddss  $dst,$dst,$src2\n\t"
6871             "pshufd  $tmp,$src2,0x01\n\t"
6872             "vaddss  $dst,$dst,$tmp\n\t"
6873             "pshufd  $tmp,$src2,0x02\n\t"
6874             "vaddss  $dst,$dst,$tmp\n\t"
6875             "pshufd  $tmp,$src2,0x03\n\t"
6876             "vaddss  $dst,$dst,$tmp\n\t"
6877             "vextractf128_high  $tmp2,$src2\n\t"
6878             "vaddss  $dst,$dst,$tmp2\n\t"
6879             "pshufd  $tmp,$tmp2,0x01\n\t"
6880             "vaddss  $dst,$dst,$tmp\n\t"
6881             "pshufd  $tmp,$tmp2,0x02\n\t"
6882             "vaddss  $dst,$dst,$tmp\n\t"
6883             "pshufd  $tmp,$tmp2,0x03\n\t"
6884             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
6885   ins_encode %{
6886     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6887     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6888     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6889     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6890     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6891     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6892     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6893     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
6894     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6895     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6896     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6897     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6898     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6899     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6900     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6901   %}
6902   ins_pipe( pipe_slow );
6903 %}
6904 
6905 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
6906   predicate(UseAVX > 2);
6907   match(Set dst (AddReductionVF dst src2));
6908   effect(TEMP tmp, TEMP dst, TEMP tmp2);
6909   format %{ "vaddss  $dst,$dst,$src2\n\t"
6910             "pshufd  $tmp,$src2,0x01\n\t"
6911             "vaddss  $dst,$dst,$tmp\n\t"
6912             "pshufd  $tmp,$src2,0x02\n\t"
6913             "vaddss  $dst,$dst,$tmp\n\t"
6914             "pshufd  $tmp,$src2,0x03\n\t"
6915             "vaddss  $dst,$dst,$tmp\n\t"
6916             "vextractf32x4  $tmp2,$src2,0x1\n\t"
6917             "vaddss  $dst,$dst,$tmp2\n\t"
6918             "pshufd  $tmp,$tmp2,0x01\n\t"
6919             "vaddss  $dst,$dst,$tmp\n\t"
6920             "pshufd  $tmp,$tmp2,0x02\n\t"
6921             "vaddss  $dst,$dst,$tmp\n\t"
6922             "pshufd  $tmp,$tmp2,0x03\n\t"
6923             "vaddss  $dst,$dst,$tmp\n\t"
6924             "vextractf32x4  $tmp2,$src2,0x2\n\t"
6925             "vaddss  $dst,$dst,$tmp2\n\t"
6926             "pshufd  $tmp,$tmp2,0x01\n\t"
6927             "vaddss  $dst,$dst,$tmp\n\t"
6928             "pshufd  $tmp,$tmp2,0x02\n\t"
6929             "vaddss  $dst,$dst,$tmp\n\t"
6930             "pshufd  $tmp,$tmp2,0x03\n\t"
6931             "vaddss  $dst,$dst,$tmp\n\t"
6932             "vextractf32x4  $tmp2,$src2,0x3\n\t"
6933             "vaddss  $dst,$dst,$tmp2\n\t"
6934             "pshufd  $tmp,$tmp2,0x01\n\t"
6935             "vaddss  $dst,$dst,$tmp\n\t"
6936             "pshufd  $tmp,$tmp2,0x02\n\t"
6937             "vaddss  $dst,$dst,$tmp\n\t"
6938             "pshufd  $tmp,$tmp2,0x03\n\t"
6939             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
6940   ins_encode %{
6941     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
6942     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
6943     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6944     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
6945     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6946     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
6947     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6948     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
6949     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6950     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6951     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6952     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6953     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6954     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6955     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6956     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
6957     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6958     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6959     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6960     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6961     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6962     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6963     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6964     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
6965     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
6966     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
6967     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6968     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
6969     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6970     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
6971     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
6972   %}
6973   ins_pipe( pipe_slow );
6974 %}
6975 
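// Double add reductions follow the same ordered pattern; pshufd with
// immediate 0xE copies the high 64-bit lane of a 128-bit chunk into the low
// lane so addsd can consume the elements in order.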
6976 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6977   predicate(UseSSE >= 1 && UseAVX == 0);
6978   match(Set dst (AddReductionVD dst src2));
6979   effect(TEMP tmp, TEMP dst);
6980   format %{ "addsd   $dst,$src2\n\t"
6981             "pshufd  $tmp,$src2,0xE\n\t"
6982             "addsd   $dst,$tmp\t! add reduction2D" %}
6983   ins_encode %{
6984     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
6985     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
6986     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
6987   %}
6988   ins_pipe( pipe_slow );
6989 %}
6990 
6991 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
6992   predicate(UseAVX > 0);
6993   match(Set dst (AddReductionVD dst src2));
6994   effect(TEMP tmp, TEMP dst);
6995   format %{ "vaddsd  $dst,$dst,$src2\n\t"
6996             "pshufd  $tmp,$src2,0xE\n\t"
6997             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
6998   ins_encode %{
6999     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7000     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7001     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7002   %}
7003   ins_pipe( pipe_slow );
7004 %}
7005 
7006 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
7007   predicate(UseAVX > 0);
7008   match(Set dst (AddReductionVD dst src2));
7009   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7010   format %{ "vaddsd  $dst,$dst,$src2\n\t"
7011             "pshufd  $tmp,$src2,0xE\n\t"
7012             "vaddsd  $dst,$dst,$tmp\n\t"
7013             "vextractf128  $tmp2,$src2,0x1\n\t"
7014             "vaddsd  $dst,$dst,$tmp2\n\t"
7015             "pshufd  $tmp,$tmp2,0xE\n\t"
7016             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
7017   ins_encode %{
7018     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7019     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7020     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7021     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7022     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7023     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7024     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7025   %}
7026   ins_pipe( pipe_slow );
7027 %}
7028 
7029 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
7030   predicate(UseAVX > 2);
7031   match(Set dst (AddReductionVD dst src2));
7032   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7033   format %{ "vaddsd  $dst,$dst,$src2\n\t"
7034             "pshufd  $tmp,$src2,0xE\n\t"
7035             "vaddsd  $dst,$dst,$tmp\n\t"
7036             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7037             "vaddsd  $dst,$dst,$tmp2\n\t"
7038             "pshufd  $tmp,$tmp2,0xE\n\t"
7039             "vaddsd  $dst,$dst,$tmp\n\t"
7040             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7041             "vaddsd  $dst,$dst,$tmp2\n\t"
7042             "pshufd  $tmp,$tmp2,0xE\n\t"
7043             "vaddsd  $dst,$dst,$tmp\n\t"
7044             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7045             "vaddsd  $dst,$dst,$tmp2\n\t"
7046             "pshufd  $tmp,$tmp2,0xE\n\t"
7047             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
7048   ins_encode %{
7049     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7050     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7051     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7052     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7053     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7054     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7055     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7056     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
7057     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7058     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7059     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7060     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
7061     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7062     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7063     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7064   %}
7065   ins_pipe( pipe_slow );
7066 %}
7067 
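// FP subtract reductions mirror the ordered add reductions above: they
// compute ((dst - s0) - s1) - ... by shuffling or extracting each lane into
// element 0 and applying scalar subss/subsd.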
7068 instruct rssub2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
7069   predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
7070   match(Set dst (SubReductionVFP dst src2));
7071   effect(TEMP dst, TEMP tmp);
  format %{ "subss   $dst,$src2\n\t"
            "pshufd  $tmp,$src2,0x01\n\t"
            "subss   $dst,$tmp\t! sub reduction2F" %}
7075   ins_encode %{
7076     __ subss($dst$$XMMRegister, $src2$$XMMRegister);
7077     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7078     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
7079   %}
7080   ins_pipe( pipe_slow );
7081 %}
7082 
7083 instruct rvsub2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
7084   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
7085   match(Set dst (SubReductionVFP dst src2));
7086   effect(TEMP dst, TEMP tmp);
7087   format %{ "vsubss  $dst,$dst,$src2\n\t"
7088             "pshufd  $tmp,$src2,0x01\n\t"
7089             "vsubss  $dst,$dst,$tmp\t! sub reduction2F" %}
7090   ins_encode %{
7091     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7092     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7093     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7094   %}
7095   ins_pipe( pipe_slow );
7096 %}
7097 
7098 instruct rssub4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
7099   predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
7100   match(Set dst (SubReductionVFP dst src2));
7101   effect(TEMP dst, TEMP tmp);
7102   format %{ "subss   $dst,$src2\n\t"
7103             "pshufd  $tmp,$src2,0x01\n\t"
7104             "subss   $dst,$tmp\n\t"
7105             "pshufd  $tmp,$src2,0x02\n\t"
7106             "subss   $dst,$tmp\n\t"
7107             "pshufd  $tmp,$src2,0x03\n\t"
7108             "subss   $dst,$tmp\t! sub reduction4F" %}
7109   ins_encode %{
7110     __ subss($dst$$XMMRegister, $src2$$XMMRegister);
7111     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7112     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
7113     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7114     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
7115     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7116     __ subss($dst$$XMMRegister, $tmp$$XMMRegister);
7117   %}
7118   ins_pipe( pipe_slow );
7119 %}
7120 
7121 instruct rvsub4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
7122   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
7123   match(Set dst (SubReductionVFP dst src2));
7124   effect(TEMP tmp, TEMP dst);
  format %{ "vsubss  $dst,$dst,$src2\n\t"
7126             "pshufd  $tmp,$src2,0x01\n\t"
7127             "vsubss  $dst,$dst,$tmp\n\t"
7128             "pshufd  $tmp,$src2,0x02\n\t"
7129             "vsubss  $dst,$dst,$tmp\n\t"
7130             "pshufd  $tmp,$src2,0x03\n\t"
7131             "vsubss  $dst,$dst,$tmp\t! sub reduction4F" %}
7132   ins_encode %{
7133     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7134     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7135     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7136     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7137     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7138     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7139     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7140   %}
7141   ins_pipe( pipe_slow );
7142 %}
7143 
7144 instruct rsub8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
7145   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
7146   match(Set dst (SubReductionVFP dst src2));
7147   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7148   format %{ "vsubss  $dst,$dst,$src2\n\t"
7149             "pshufd  $tmp,$src2,0x01\n\t"
7150             "vsubss  $dst,$dst,$tmp\n\t"
7151             "pshufd  $tmp,$src2,0x02\n\t"
7152             "vsubss  $dst,$dst,$tmp\n\t"
7153             "pshufd  $tmp,$src2,0x03\n\t"
7154             "vsubss  $dst,$dst,$tmp\n\t"
7155             "vextractf128_high  $tmp2,$src2\n\t"
7156             "vsubss  $dst,$dst,$tmp2\n\t"
7157             "pshufd  $tmp,$tmp2,0x01\n\t"
7158             "vsubss  $dst,$dst,$tmp\n\t"
7159             "pshufd  $tmp,$tmp2,0x02\n\t"
7160             "vsubss  $dst,$dst,$tmp\n\t"
7161             "pshufd  $tmp,$tmp2,0x03\n\t"
7162             "vsubss  $dst,$dst,$tmp\t! sub reduction8F" %}
7163   ins_encode %{
7164     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7165     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7166     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7167     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7168     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7169     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7170     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7171     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7172     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7173     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7174     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7175     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7176     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7177     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7178     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7179   %}
7180   ins_pipe( pipe_slow );
7181 %}
7182 
7183 instruct rsub16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
7184   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
7185   match(Set dst (SubReductionVFP dst src2));
7186   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7187   format %{ "vsubss  $dst,$dst,$src2\n\t"
7188             "pshufd  $tmp,$src2,0x01\n\t"
7189             "vsubss  $dst,$dst,$tmp\n\t"
7190             "pshufd  $tmp,$src2,0x02\n\t"
7191             "vsubss  $dst,$dst,$tmp\n\t"
7192             "pshufd  $tmp,$src2,0x03\n\t"
7193             "vsubss  $dst,$dst,$tmp\n\t"
7194             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7195             "vsubss  $dst,$dst,$tmp2\n\t"
7196             "pshufd  $tmp,$tmp2,0x01\n\t"
7197             "vsubss  $dst,$dst,$tmp\n\t"
7198             "pshufd  $tmp,$tmp2,0x02\n\t"
7199             "vsubss  $dst,$dst,$tmp\n\t"
7200             "pshufd  $tmp,$tmp2,0x03\n\t"
7201             "vsubss  $dst,$dst,$tmp\n\t"
7202             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7203             "vsubss  $dst,$dst,$tmp2\n\t"
7204             "pshufd  $tmp,$tmp2,0x01\n\t"
7205             "vsubss  $dst,$dst,$tmp\n\t"
7206             "pshufd  $tmp,$tmp2,0x02\n\t"
7207             "vsubss  $dst,$dst,$tmp\n\t"
7208             "pshufd  $tmp,$tmp2,0x03\n\t"
7209             "vsubss  $dst,$dst,$tmp\n\t"
7210             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7211             "vsubss  $dst,$dst,$tmp2\n\t"
7212             "pshufd  $tmp,$tmp2,0x01\n\t"
7213             "vsubss  $dst,$dst,$tmp\n\t"
7214             "pshufd  $tmp,$tmp2,0x02\n\t"
7215             "vsubss  $dst,$dst,$tmp\n\t"
7216             "pshufd  $tmp,$tmp2,0x03\n\t"
7217             "vsubss  $dst,$dst,$tmp\t! sub reduction16F" %}
7218   ins_encode %{
7219     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7220     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7221     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7222     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7223     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7224     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7225     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7226     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7227     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7228     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7229     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7230     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7231     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7232     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7233     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7234     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
7235     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7236     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7237     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7238     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7239     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7240     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7241     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7242     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
7243     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7244     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7245     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7246     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7247     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7248     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7249     __ vsubss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7250   %}
7251   ins_pipe( pipe_slow );
7252 %}
7253 
7254 instruct rssub2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
7255   predicate(UseSSE >= 1 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
7256   match(Set dst (SubReductionVFP dst src2));
7257   effect(TEMP tmp, TEMP dst);
7258   format %{ "subsd   $dst,$src2\n\t"
7259             "pshufd  $tmp,$src2,0xE\n\t"
7260             "subsd   $dst,$tmp\t! sub reduction2D" %}
7261   ins_encode %{
7262     __ subsd($dst$$XMMRegister, $src2$$XMMRegister);
7263     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7264     __ subsd($dst$$XMMRegister, $tmp$$XMMRegister);
7265   %}
7266   ins_pipe( pipe_slow );
7267 %}
7268 
7269 instruct rvsub2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
7270   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
7271   match(Set dst (SubReductionVFP dst src2));
7272   effect(TEMP tmp, TEMP dst);
7273   format %{ "vsubsd  $dst,$dst,$src2\n\t"
7274             "pshufd  $tmp,$src2,0xE\n\t"
7275             "vsubsd  $dst,$dst,$tmp\t! sub reduction2D" %}
7276   ins_encode %{
7277     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7278     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7279     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7280   %}
7281   ins_pipe( pipe_slow );
7282 %}
7283 
7284 instruct rvsub4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
7285   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
7286   match(Set dst (SubReductionVFP dst src2));
7287   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7288   format %{ "vsubsd  $dst,$dst,$src2\n\t"
7289             "pshufd  $tmp,$src2,0xE\n\t"
7290             "vsubsd  $dst,$dst,$tmp\n\t"
            "vextractf128  $tmp2,$src2,0x1\n\t"
7292             "vsubsd  $dst,$dst,$tmp2\n\t"
7293             "pshufd  $tmp,$tmp2,0xE\n\t"
7294             "vsubsd  $dst,$dst,$tmp\t! sub reduction4D" %}
7295   ins_encode %{
7296     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7297     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7298     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7299     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7300     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7301     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7302     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7303   %}
7304   ins_pipe( pipe_slow );
7305 %}
7306 
7307 instruct rvsub8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
7308   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
7309   match(Set dst (SubReductionVFP dst src2));
7310   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7311   format %{ "vsubsd  $dst,$dst,$src2\n\t"
7312             "pshufd  $tmp,$src2,0xE\n\t"
7313             "vsubsd  $dst,$dst,$tmp\n\t"
7314             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7315             "vsubsd  $dst,$dst,$tmp2\n\t"
7316             "pshufd  $tmp,$tmp2,0xE\n\t"
7317             "vsubsd  $dst,$dst,$tmp\n\t"
7318             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7319             "vsubsd  $dst,$dst,$tmp2\n\t"
7320             "pshufd  $tmp,$tmp2,0xE\n\t"
7321             "vsubsd  $dst,$dst,$tmp\n\t"
7322             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7323             "vsubsd  $dst,$dst,$tmp2\n\t"
7324             "pshufd  $tmp,$tmp2,0xE\n\t"
7325             "vsubsd  $dst,$dst,$tmp\t! sub reduction8D" %}
7326   ins_encode %{
7327     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7328     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7329     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7330     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7331     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7332     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7333     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7334     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
7335     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7336     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7337     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7338     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
7339     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7340     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7341     __ vsubsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7342   %}
7343   ins_pipe( pipe_slow );
7344 %}
7345 
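// Byte multiply reductions.  pmullw only exists for words, so the bytes are
// first sign-extended to words (pmovsxbw), the word lanes are folded with a
// pmullw shuffle tree, and the last two lanes are finished in a GPR with
// pextrw + imull.  Keeping only the low 16 bits of each partial product is
// safe: the final movsbl retains just the low byte, and the low byte of a
// product depends only on the low bytes of its factors.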
7346 instruct rsmul8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7347   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7348   match(Set dst (MulReductionVI src1 src2));
7349   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7350   format %{ "pmovsxbw $tmp,$src2\n\t"
7351             "pshufd   $tmp1,$tmp,0xE\n\t"
7352             "pmullw   $tmp,$tmp1\n\t"
7353             "pshufd   $tmp1,$tmp,0x1\n\t"
7354             "pmullw   $tmp,$tmp1\n\t"
7355             "pextrw   $tmp2,$tmp, 0x1\n\t"
7356             "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
7358             "movsbl   $dst,$src1\n\t"
7359             "imull    $dst,$tmp2\n\t"
7360             "movsbl   $dst,$dst\t! mul reduction8B" %}
7361   ins_encode %{
7362     __ pmovsxbw($tmp$$XMMRegister, $src2$$XMMRegister);
7363     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7364     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7365     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7366     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7367     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7368     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7369     __ imull($tmp2$$Register, $tmp3$$Register);
7370     __ movsbl($dst$$Register, $src1$$Register);
7371     __ imull($dst$$Register, $tmp2$$Register);
7372     __ movsbl($dst$$Register, $dst$$Register);
7373   %}
7374   ins_pipe( pipe_slow );
7375 %}
7376 
7377 instruct rsmul16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7378   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7379   match(Set dst (MulReductionVI src1 src2));
7380   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7381   format %{ "pmovsxbw $tmp,$src2\n\t"
7382             "pshufd   $tmp1,$src2,0xEE\n\t"
7383             "pmovsxbw $tmp1,$tmp1\n\t"
7384             "pmullw   $tmp,$tmp1\n\t"
7385             "pshufd   $tmp1,$tmp,0xE\n\t"
7386             "pmullw   $tmp,$tmp1\n\t"
7387             "pshufd   $tmp1,$tmp,0x1\n\t"
7388             "pmullw   $tmp,$tmp1\n\t"
7389             "pextrw   $tmp2,$tmp, 0x1\n\t"
7390             "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
7392             "movsbl   $dst,$src1\n\t"
7393             "imull    $dst,$tmp2\n\t"
7394             "movsbl   $dst,$dst\t! mul reduction16B" %}
7395   ins_encode %{
7397     __ pmovsxbw($tmp$$XMMRegister, $src2$$XMMRegister);
7398     __ pshufd($tmp1$$XMMRegister, $src2$$XMMRegister, 0xEE);
7399     __ pmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister);
7400     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7401     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7402     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7403     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7404     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7405     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7406     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7407     __ imull($tmp2$$Register, $tmp3$$Register);
7408     __ movsbl($dst$$Register, $src1$$Register);
7409     __ imull($dst$$Register, $tmp2$$Register);
7410     __ movsbl($dst$$Register, $dst$$Register);
7411   %}
7412   ins_pipe( pipe_slow );
7413 %}
7414 
7415 instruct rvmul32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7416   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7417   match(Set dst (MulReductionVI src1 src2));
7418   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7419   format %{ "vextracti128_high  $tmp,$src2\n\t"
            "vpmovsxbw $tmp,$tmp\n\t"
            "vpmovsxbw $tmp1,$src2\n\t"
7422             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7423             "vextracti128_high  $tmp1,$tmp\n\t"
7424             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7425             "pshufd   $tmp1,$tmp,0xE\n\t"
7426             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7427             "pshufd   $tmp1,$tmp,0x1\n\t"
7428             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7429             "pextrw   $tmp2,$tmp, 0x1\n\t"
7430             "pextrw   $tmp3,$tmp, 0x0\n\t"
            "imull    $tmp2,$tmp3\n\t"
7432             "movsbl   $dst,$src1\n\t"
7433             "imull    $dst,$tmp2\n\t"
7434             "movsbl   $dst,$dst\t! mul reduction32B" %}
7435   ins_encode %{
7436     int vector_len = 1;
7437     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7438     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
7439     __ vpmovsxbw($tmp1$$XMMRegister, $src2$$XMMRegister, vector_len);
7440     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7441     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
7442     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7443     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7444     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7445     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7446     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7447     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7448     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7449     __ imull($tmp2$$Register, $tmp3$$Register);
7450     __ movsbl($dst$$Register, $src1$$Register);
7451     __ imull($dst$$Register, $tmp2$$Register);
7452     __ movsbl($dst$$Register, $dst$$Register);
7453   %}
7454   ins_pipe( pipe_slow );
7455 %}
7456 
7457 instruct rvmul64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7458   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
7459   match(Set dst (MulReductionVI src1 src2));
7460   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7461   format %{ "vextracti64x4_high  $tmp,$src2\n\t"
7462             "vpmovsxbw $tmp,$tmp\n\t"
7463             "vpmovsxbw $tmp1,$src2\n\t"
7464             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7465             "vextracti64x4_high  $tmp1,$tmp\n\t"
7466             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7467             "vextracti128_high  $tmp1,$tmp\n\t"
7468             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7469             "pshufd   $tmp1,$tmp,0xE\n\t"
7470             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7471             "pshufd   $tmp1,$tmp,0x1\n\t"
7472             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7473             "pextrw   $tmp2,$tmp, 0x1\n\t"
7474             "pextrw   $tmp3,$tmp, 0x0\n\t"
7475             "imull    $tmp2,$tmp3 \n\t"
7476             "movsbl   $dst,$src1\n\t"
7477             "imull    $dst,$tmp2\n\t"
7478             "movsbl   $dst,$dst\t! mul reduction64B" %}
7479   ins_encode %{
7480     int vector_len = 2;
7481     __ vextracti64x4_high($tmp$$XMMRegister, $src2$$XMMRegister);
7482     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
7483     __ vpmovsxbw($tmp1$$XMMRegister, $src2$$XMMRegister, vector_len);
7484     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7485     __ vextracti64x4_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
7486     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 1);
7487     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
7488     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7489     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7490     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7491     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7492     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7493     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7494     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7495     __ imull($tmp2$$Register, $tmp3$$Register);
7496     __ movsbl($dst$$Register, $src1$$Register);
7497     __ imull($dst$$Register, $tmp2$$Register);
7498     __ movsbl($dst$$Register, $dst$$Register);
7499   %}
7500   ins_pipe( pipe_slow );
7501 %}
7502 
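// Short multiply reductions can use pmullw on the element type directly: pshufd
// halves the vector until one word pair is left, the last two words are extracted
// with pextrw and combined with imull, and movswl sign-extends the low 16 bits of
// the final product (the low 16 bits of a product depend only on the low 16 bits
// of the factors).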
7503 instruct rsmul4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
7504   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7505   match(Set dst (MulReductionVI src1 src2));
7506   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3);
7507   format %{ "pshufd  $tmp,$src2,0x1\n\t"
7508             "pmullw  $tmp,$src2\n\t"
7509             "pextrw  $tmp2,$tmp, 0x1\n\t"
7510             "pextrw  $tmp3,$tmp, 0x0\n\t"
7511             "imull    $tmp2,$tmp3 \n\t"
7512             "movswl   $dst,$src1\n\t"
7513             "imull    $dst,$tmp2\n\t"
7514             "movswl   $dst,$dst\t! mul reduction4S" %}
7515   ins_encode %{
7516     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
7517     __ pmullw($tmp$$XMMRegister, $src2$$XMMRegister);
7518     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7519     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7520     __ imull($tmp2$$Register, $tmp3$$Register);
7521     __ movswl($dst$$Register, $src1$$Register);
7522     __ imull($dst$$Register, $tmp2$$Register);
7523     __ movswl($dst$$Register, $dst$$Register);
7524   %}
7525   ins_pipe( pipe_slow );
7526 %}
7527 
7528 instruct rsmul8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7529   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7530   match(Set dst (MulReductionVI src1 src2));
7531   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7532   format %{ "pshufd  $tmp,$src2,0xE\n\t"
7533             "pmullw  $tmp,$src2\n\t"
7534             "pshufd  $tmp1,$tmp,0x1\n\t"
7535             "pmullw  $tmp,$tmp1\n\t"
7536             "pextrw  $tmp2,$tmp, 0x1\n\t"
7537             "pextrw  $tmp3,$tmp, 0x0\n\t"
7538             "imull    $tmp2,$tmp3 \n\t"
7539             "movswl   $dst,$src1\n\t"
7540             "imull    $dst,$tmp2\n\t"
7541             "movswl   $dst,$dst\t! mul reduction8S" %}
7542   ins_encode %{
7543     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
7544     __ pmullw($tmp$$XMMRegister, $src2$$XMMRegister);
7545     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7546     __ pmullw($tmp$$XMMRegister, $tmp1$$XMMRegister);
7547     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7548     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7549     __ imull($tmp2$$Register, $tmp3$$Register);
7550     __ movswl($dst$$Register, $src1$$Register);
7551     __ imull($dst$$Register, $tmp2$$Register);
7552     __ movswl($dst$$Register, $dst$$Register);
7553   %}
7554   ins_pipe( pipe_slow );
7555 %}
7556 
7557 instruct rvmul16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7558   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7559   match(Set dst (MulReductionVI src1 src2));
7560   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7561   format %{ "vextracti128_high  $tmp,$src2\n\t"
7562             "vpmullw  $tmp,$tmp,$src2\n\t"
7563             "pshufd  $tmp1,$tmp,0xE\n\t"
7564             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7565             "pshufd  $tmp1,$tmp,0x1\n\t"
7566             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7567             "pextrw  $tmp2,$tmp, 0x1\n\t"
7568             "pextrw  $tmp3,$tmp, 0x0\n\t"
7569             "imull    $tmp2,$tmp3 \n\t"
7570             "movswl   $dst,$src1\n\t"
7571             "imull    $dst,$tmp2\n\t"
7572             "movswl   $dst,$dst\t! mul reduction16S" %}
7573   ins_encode %{
7574     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7575     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 1);
7576     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7577     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7578     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7579     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, 0);
7580     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7581     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7582     __ imull($tmp2$$Register, $tmp3$$Register);
7583     __ movswl($dst$$Register, $src1$$Register);
7584     __ imull($dst$$Register, $tmp2$$Register);
7585     __ movswl($dst$$Register, $dst$$Register);
7586   %}
7587   ins_pipe( pipe_slow );
7588 %}
7589 
7590 instruct rvmul32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp1, rRegI tmp2, rRegI tmp3) %{
7591   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
7592   match(Set dst (MulReductionVI src1 src2));
7593   effect(TEMP dst, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3);
7594   format %{ "vextracti64x4_high  $tmp1,$src2\n\t"
7595             "vpmullw  $tmp1,$tmp1,$src2\n\t"
7596             "vextracti128_high  $tmp,$tmp1\n\t"
7597             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7598             "pshufd  $tmp1,$tmp,0xE\n\t"
7599             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7600             "pshufd  $tmp1,$tmp,0x1\n\t"
7601             "vpmullw  $tmp,$tmp,$tmp1\n\t"
7602             "pextrw  $tmp2,$tmp, 0x1\n\t"
7603             "pextrw  $tmp3,$tmp, 0x0\n\t"
7604             "imull    $tmp2,$tmp3 \n\t"
7605             "movswl   $dst,$src1\n\t"
7606             "imull    $dst,$tmp2\n\t"
7607             "movswl   $dst,$dst\t! mul reduction32S" %}
7608   ins_encode %{
7609     int vector_len = 0;
7610     __ vextracti64x4_high($tmp1$$XMMRegister, $src2$$XMMRegister);
7611     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $src2$$XMMRegister, 1);
7612     __ vextracti128_high($tmp$$XMMRegister, $tmp1$$XMMRegister);
7613     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7614     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0xE);
7615     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7616     __ pshufd($tmp1$$XMMRegister, $tmp$$XMMRegister, 0x1);
7617     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7618     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
7619     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
7620     __ imull($tmp2$$Register, $tmp3$$Register);
7621     __ movswl($dst$$Register, $src1$$Register);
7622     __ imull($dst$$Register, $tmp2$$Register);
7623     __ movswl($dst$$Register, $dst$$Register);
7624   %}
7625   ins_pipe( pipe_slow );
7626 %}
7627 
7628 
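// Int multiply reductions use the packed doubleword multiply: pmulld is an
// SSE4.1 instruction, hence UseSSE > 3 on the SSE variants, while the AVX
// variants use the three-operand vpmulld.  The scalar input src1 is inserted
// into an XMM register with movdl so the last multiply stays in the vector
// domain.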
7629 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
7630   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7631   match(Set dst (MulReductionVI src1 src2));
7632   effect(TEMP tmp, TEMP tmp2);
7633   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
7634             "pmulld  $tmp2,$src2\n\t"
7635             "movd    $tmp,$src1\n\t"
7636             "pmulld  $tmp2,$tmp\n\t"
7637             "movd    $dst,$tmp2\t! mul reduction2I" %}
7638   ins_encode %{
7639     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7640     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
7641     __ movdl($tmp$$XMMRegister, $src1$$Register);
7642     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7643     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7644   %}
7645   ins_pipe( pipe_slow );
7646 %}
7647 
7648 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
7649   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7650   match(Set dst (MulReductionVI src1 src2));
7651   effect(TEMP tmp, TEMP tmp2);
7652   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
7653             "vpmulld  $tmp,$src2,$tmp2\n\t"
7654             "movd     $tmp2,$src1\n\t"
7655             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7656             "movd     $dst,$tmp2\t! mul reduction2I" %}
7657   ins_encode %{
7658     int vector_len = 0;
7659     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
7660     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7661     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7662     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7663     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7664   %}
7665   ins_pipe( pipe_slow );
7666 %}
7667 
7668 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
7669   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7670   match(Set dst (MulReductionVI src1 src2));
7671   effect(TEMP tmp, TEMP tmp2);
7672   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
7673             "pmulld  $tmp2,$src2\n\t"
7674             "pshufd  $tmp,$tmp2,0x1\n\t"
7675             "pmulld  $tmp2,$tmp\n\t"
7676             "movd    $tmp,$src1\n\t"
7677             "pmulld  $tmp2,$tmp\n\t"
7678             "movd    $dst,$tmp2\t! mul reduction4I" %}
7679   ins_encode %{
7680     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7681     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
7682     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
7683     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7684     __ movdl($tmp$$XMMRegister, $src1$$Register);
7685     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
7686     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7687   %}
7688   ins_pipe( pipe_slow );
7689 %}
7690 
7691 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
7692   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7693   match(Set dst (MulReductionVI src1 src2));
7694   effect(TEMP tmp, TEMP tmp2);
7695   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
7696             "vpmulld  $tmp,$src2,$tmp2\n\t"
7697             "pshufd   $tmp2,$tmp,0x1\n\t"
7698             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7699             "movd     $tmp2,$src1\n\t"
7700             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7701             "movd     $dst,$tmp2\t! mul reduction4I" %}
7702   ins_encode %{
7703     int vector_len = 0;
7704     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7705     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7706     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7707     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7708     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7709     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7710     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7711   %}
7712   ins_pipe( pipe_slow );
7713 %}
7714 
7715 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
7716   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7717   match(Set dst (MulReductionVI src1 src2));
7718   effect(TEMP tmp, TEMP tmp2);
7719   format %{ "vextracti128_high  $tmp,$src2\n\t"
7720             "vpmulld  $tmp,$tmp,$src2\n\t"
7721             "pshufd   $tmp2,$tmp,0xE\n\t"
7722             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7723             "pshufd   $tmp2,$tmp,0x1\n\t"
7724             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7725             "movd     $tmp2,$src1\n\t"
7726             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7727             "movd     $dst,$tmp2\t! mul reduction8I" %}
7728   ins_encode %{
7729     int vector_len = 0;
7730     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7731     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
7732     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
7733     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7734     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7735     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7736     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7737     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7738     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7739   %}
7740   ins_pipe( pipe_slow );
7741 %}
7742 
7743 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
7744   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
7745   match(Set dst (MulReductionVI src1 src2));
7746   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
7747   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
7748             "vpmulld  $tmp3,$tmp3,$src2\n\t"
7749             "vextracti128_high  $tmp,$tmp3\n\t"
7750             "vpmulld  $tmp,$tmp,$tmp3\n\t"
7751             "pshufd   $tmp2,$tmp,0xE\n\t"
7752             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7753             "pshufd   $tmp2,$tmp,0x1\n\t"
7754             "vpmulld  $tmp,$tmp,$tmp2\n\t"
7755             "movd     $tmp2,$src1\n\t"
7756             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
7757             "movd     $dst,$tmp2\t! mul reduction16I" %}
7758   ins_encode %{
7759     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
7760     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
7761     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
7762     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
7763     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
7764     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7765     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
7766     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7767     __ movdl($tmp2$$XMMRegister, $src1$$Register);
7768     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7769     __ movdl($dst$$Register, $tmp2$$XMMRegister);
7770   %}
7771   ins_pipe( pipe_slow );
7772 %}
7773 
7774 #ifdef _LP64
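// Long multiply reductions exist only on 64-bit and are built on vpmullq, an
// AVX-512DQ instruction; that is why every variant below is predicated on
// UseAVX > 2 && VM_Version::supports_avx512dq().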
7775 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
7776   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7777   match(Set dst (MulReductionVL src1 src2));
7778   effect(TEMP tmp, TEMP tmp2);
7779   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
7780             "vpmullq  $tmp,$src2,$tmp2\n\t"
7781             "movdq    $tmp2,$src1\n\t"
7782             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
7783             "movdq    $dst,$tmp2\t! mul reduction2L" %}
7784   ins_encode %{
7785     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
7786     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
7787     __ movdq($tmp2$$XMMRegister, $src1$$Register);
7788     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
7789     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7790   %}
7791   ins_pipe( pipe_slow );
7792 %}
7793 
7794 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
7795   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7796   match(Set dst (MulReductionVL src1 src2));
7797   effect(TEMP tmp, TEMP tmp2);
7798   format %{ "vextracti128_high  $tmp,$src2\n\t"
7799             "vpmullq  $tmp2,$tmp,$src2\n\t"
7800             "pshufd   $tmp,$tmp2,0xE\n\t"
7801             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7802             "movdq    $tmp,$src1\n\t"
7803             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7804             "movdq    $dst,$tmp2\t! mul reduction4L" %}
7805   ins_encode %{
7806     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
7807     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
7808     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7809     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7810     __ movdq($tmp$$XMMRegister, $src1$$Register);
7811     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7812     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7813   %}
7814   ins_pipe( pipe_slow );
7815 %}
7816 
7817 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
7818   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
7819   match(Set dst (MulReductionVL src1 src2));
7820   effect(TEMP tmp, TEMP tmp2);
7821   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
7822             "vpmullq  $tmp2,$tmp2,$src2\n\t"
7823             "vextracti128_high  $tmp,$tmp2\n\t"
7824             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7825             "pshufd   $tmp,$tmp2,0xE\n\t"
7826             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7827             "movdq    $tmp,$src1\n\t"
7828             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
7829             "movdq    $dst,$tmp2\t! mul reduction8L" %}
7830   ins_encode %{
7831     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7832     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
7833     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
7834     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7835     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
7836     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7837     __ movdq($tmp$$XMMRegister, $src1$$Register);
7838     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
7839     __ movdq($dst$$Register, $tmp2$$XMMRegister);
7840   %}
7841   ins_pipe( pipe_slow );
7842 %}
7843 #endif
7844 
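// Unlike the integer reductions above, the float and double multiply reductions
// are kept strictly ordered: dst carries the running product and each lane is
// multiplied in one mulss/vmulss at a time, with pshufd only bringing the next
// lane into element 0.  FP multiplication is not associative, so a pairwise
// tree would not be an equivalent reduction.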
7845 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
7846   predicate(UseSSE >= 1 && UseAVX == 0);
7847   match(Set dst (MulReductionVF dst src2));
7848   effect(TEMP dst, TEMP tmp);
7849   format %{ "mulss   $dst,$src2\n\t"
7850             "pshufd  $tmp,$src2,0x01\n\t"
7851             "mulss   $dst,$tmp\t! mul reduction2F" %}
7852   ins_encode %{
7853     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
7854     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7855     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7856   %}
7857   ins_pipe( pipe_slow );
7858 %}
7859 
7860 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
7861   predicate(UseAVX > 0);
7862   match(Set dst (MulReductionVF dst src2));
7863   effect(TEMP tmp, TEMP dst);
7864   format %{ "vmulss  $dst,$dst,$src2\n\t"
7865             "pshufd  $tmp,$src2,0x01\n\t"
7866             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
7867   ins_encode %{
7868     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7869     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7870     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7871   %}
7872   ins_pipe( pipe_slow );
7873 %}
7874 
7875 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
7876   predicate(UseSSE >= 1 && UseAVX == 0);
7877   match(Set dst (MulReductionVF dst src2));
7878   effect(TEMP dst, TEMP tmp);
7879   format %{ "mulss   $dst,$src2\n\t"
7880             "pshufd  $tmp,$src2,0x01\n\t"
7881             "mulss   $dst,$tmp\n\t"
7882             "pshufd  $tmp,$src2,0x02\n\t"
7883             "mulss   $dst,$tmp\n\t"
7884             "pshufd  $tmp,$src2,0x03\n\t"
7885             "mulss   $dst,$tmp\t! mul reduction4F" %}
7886   ins_encode %{
7887     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
7888     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7889     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7890     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7891     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7892     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7893     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
7894   %}
7895   ins_pipe( pipe_slow );
7896 %}
7897 
7898 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
7899   predicate(UseAVX > 0);
7900   match(Set dst (MulReductionVF dst src2));
7901   effect(TEMP tmp, TEMP dst);
7902   format %{ "vmulss  $dst,$dst,$src2\n\t"
7903             "pshufd  $tmp,$src2,0x01\n\t"
7904             "vmulss  $dst,$dst,$tmp\n\t"
7905             "pshufd  $tmp,$src2,0x02\n\t"
7906             "vmulss  $dst,$dst,$tmp\n\t"
7907             "pshufd  $tmp,$src2,0x03\n\t"
7908             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
7909   ins_encode %{
7910     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7911     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7912     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7913     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7914     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7915     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7916     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7917   %}
7918   ins_pipe( pipe_slow );
7919 %}
7920 
7921 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
7922   predicate(UseAVX > 0);
7923   match(Set dst (MulReductionVF dst src2));
7924   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7925   format %{ "vmulss  $dst,$dst,$src2\n\t"
7926             "pshufd  $tmp,$src2,0x01\n\t"
7927             "vmulss  $dst,$dst,$tmp\n\t"
7928             "pshufd  $tmp,$src2,0x02\n\t"
7929             "vmulss  $dst,$dst,$tmp\n\t"
7930             "pshufd  $tmp,$src2,0x03\n\t"
7931             "vmulss  $dst,$dst,$tmp\n\t"
7932             "vextractf128_high  $tmp2,$src2\n\t"
7933             "vmulss  $dst,$dst,$tmp2\n\t"
7934             "pshufd  $tmp,$tmp2,0x01\n\t"
7935             "vmulss  $dst,$dst,$tmp\n\t"
7936             "pshufd  $tmp,$tmp2,0x02\n\t"
7937             "vmulss  $dst,$dst,$tmp\n\t"
7938             "pshufd  $tmp,$tmp2,0x03\n\t"
7939             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
7940   ins_encode %{
7941     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7942     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7943     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7944     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
7945     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7946     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
7947     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7948     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
7949     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
7950     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
7951     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7952     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
7953     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7954     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
7955     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7956   %}
7957   ins_pipe( pipe_slow );
7958 %}
7959 
7960 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
7961   predicate(UseAVX > 2);
7962   match(Set dst (MulReductionVF dst src2));
7963   effect(TEMP tmp, TEMP dst, TEMP tmp2);
7964   format %{ "vmulss  $dst,$dst,$src2\n\t"
7965             "pshufd  $tmp,$src2,0x01\n\t"
7966             "vmulss  $dst,$dst,$tmp\n\t"
7967             "pshufd  $tmp,$src2,0x02\n\t"
7968             "vmulss  $dst,$dst,$tmp\n\t"
7969             "pshufd  $tmp,$src2,0x03\n\t"
7970             "vmulss  $dst,$dst,$tmp\n\t"
7971             "vextractf32x4  $tmp2,$src2,0x1\n\t"
7972             "vmulss  $dst,$dst,$tmp2\n\t"
7973             "pshufd  $tmp,$tmp2,0x01\n\t"
7974             "vmulss  $dst,$dst,$tmp\n\t"
7975             "pshufd  $tmp,$tmp2,0x02\n\t"
7976             "vmulss  $dst,$dst,$tmp\n\t"
7977             "pshufd  $tmp,$tmp2,0x03\n\t"
7978             "vmulss  $dst,$dst,$tmp\n\t"
7979             "vextractf32x4  $tmp2,$src2,0x2\n\t"
7980             "vmulss  $dst,$dst,$tmp2\n\t"
7981             "pshufd  $tmp,$tmp2,0x01\n\t"
7982             "vmulss  $dst,$dst,$tmp\n\t"
7983             "pshufd  $tmp,$tmp2,0x02\n\t"
7984             "vmulss  $dst,$dst,$tmp\n\t"
7985             "pshufd  $tmp,$tmp2,0x03\n\t"
7986             "vmulss  $dst,$dst,$tmp\n\t"
7987             "vextractf32x4  $tmp2,$src2,0x3\n\t"
7988             "vmulss  $dst,$dst,$tmp2\n\t"
7989             "pshufd  $tmp,$tmp2,0x01\n\t"
7990             "vmulss  $dst,$dst,$tmp\n\t"
7991             "pshufd  $tmp,$tmp2,0x02\n\t"
7992             "vmulss  $dst,$dst,$tmp\n\t"
7993             "pshufd  $tmp,$tmp2,0x03\n\t"
7994             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
7995   ins_encode %{
7996     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
7997     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
7998     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
7999     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
8000     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8001     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
8002     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8003     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
8004     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8005     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8006     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8007     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8008     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8009     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8010     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8011     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
8012     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8013     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8014     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8015     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8016     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8017     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8018     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8019     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
8020     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8021     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
8022     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8023     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
8024     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8025     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
8026     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8027   %}
8028   ins_pipe( pipe_slow );
8029 %}
8030 
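// The double variants below follow the same ordered pattern with mulsd/vmulsd;
// the 0xE shuffle moves the upper double of a 128-bit lane into element 0.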
8031 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
8032   predicate(UseSSE >= 1 && UseAVX == 0);
8033   match(Set dst (MulReductionVD dst src2));
8034   effect(TEMP dst, TEMP tmp);
8035   format %{ "mulsd   $dst,$src2\n\t"
8036             "pshufd  $tmp,$src2,0xE\n\t"
8037             "mulsd   $dst,$tmp\t! mul reduction2D" %}
8038   ins_encode %{
8039     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
8040     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8041     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
8042   %}
8043   ins_pipe( pipe_slow );
8044 %}
8045 
8046 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
8047   predicate(UseAVX > 0);
8048   match(Set dst (MulReductionVD dst src2));
8049   effect(TEMP tmp, TEMP dst);
8050   format %{ "vmulsd  $dst,$dst,$src2\n\t"
8051             "pshufd  $tmp,$src2,0xE\n\t"
8052             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
8053   ins_encode %{
8054     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
8055     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8056     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8057   %}
8058   ins_pipe( pipe_slow );
8059 %}
8060 
8061 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
8062   predicate(UseAVX > 0);
8063   match(Set dst (MulReductionVD dst src2));
8064   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8065   format %{ "vmulsd  $dst,$dst,$src2\n\t"
8066             "pshufd  $tmp,$src2,0xE\n\t"
8067             "vmulsd  $dst,$dst,$tmp\n\t"
8068             "vextractf128_high  $tmp2,$src2\n\t"
8069             "vmulsd  $dst,$dst,$tmp2\n\t"
8070             "pshufd  $tmp,$tmp2,0xE\n\t"
8071             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
8072   ins_encode %{
8073     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
8074     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8075     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8076     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8077     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8078     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8079     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8080   %}
8081   ins_pipe( pipe_slow );
8082 %}
8083 
8084 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
8085   predicate(UseAVX > 2);
8086   match(Set dst (MulReductionVD dst src2));
8087   effect(TEMP tmp, TEMP dst, TEMP tmp2);
8088   format %{ "vmulsd  $dst,$dst,$src2\n\t"
8089             "pshufd  $tmp,$src2,0xE\n\t"
8090             "vmulsd  $dst,$dst,$tmp\n\t"
8091             "vextractf32x4  $tmp2,$src2,0x1\n\t"
8092             "vmulsd  $dst,$dst,$tmp2\n\t"
8093             "pshufd  $tmp,$tmp2,0xE\n\t"
8094             "vmulsd  $dst,$dst,$tmp\n\t"
8095             "vextractf32x4  $tmp2,$src2,0x2\n\t"
8096             "vmulsd  $dst,$dst,$tmp2\n\t"
8097             "pshufd  $tmp,$tmp2,0xE\n\t"
8098             "vmulsd  $dst,$dst,$tmp\n\t"
8099             "vextractf32x4  $tmp2,$src2,0x3\n\t"
8100             "vmulsd  $dst,$dst,$tmp2\n\t"
8101             "pshufd  $tmp,$tmp2,0xE\n\t"
8102             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
8103   ins_encode %{
8104     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
8105     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8106     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8107     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
8108     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8109     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8110     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8111     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
8112     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8113     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8114     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8115     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
8116     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
8117     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8118     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
8119   %}
8120   ins_pipe( pipe_slow );
8121 %}
8122 
8123 //--------------------Min Reduction --------------------
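// The integer min reductions below fold the vector with packed mins (pminsb,
// pminsw, pminsd or their VEX forms) and finish in GPRs: the last two elements
// are extracted and sign-extended, and cmpl followed by cmovl(Assembler::less)
// selects the smaller value, with the scalar input src1 taking part in the same
// compare-and-move chain.  Roughly equivalent scalar form (illustration only):
//   int r = src1;
//   for (byte b : v) r = Math.min(r, b);
//   return r;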
8124 instruct rsmin8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8125   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8126   match(Set dst (MinReductionV src1 src2));
8127   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8128   format %{ "pshufd  $tmp,$src2,0x1\n\t"
8129             "pminsb  $tmp,$src2\n\t"
8130             "pextrb  $tmp2,$tmp, 0x1\n\t"
8131             "movsbl  $tmp2,$tmp2\n\t"
8132             "pextrb  $tmp3,$tmp,0x0\n\t"
8133             "movsbl  $tmp3,$tmp3\n\t"
8134             "cmpl  $tmp2,$tmp3\n\t"
8135             "cmovl  $tmp3,$tmp2\n\t"
8136             "cmpl  $src1,$tmp3\n\t"
8137             "cmovl  $tmp3,$src1\n\t"
8138             "movl  $dst,$tmp3\n\t"
8139             "pextrb  $tmp2,$tmp,0x3\n\t"
8140             "movsbl  $tmp2,$tmp2\n\t"
8141             "pextrb  $tmp3,$tmp,0x2\n\t"
8142             "movsbl  $tmp3,$tmp3\n\t"
8143             "cmpl  $tmp2,$tmp3\n\t"
8144             "cmovl  $tmp3,$tmp2\n\t"
8145             "cmpl  $tmp3,$dst\n\t"
8146             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction8B" %}
8147   ins_encode %{
8148     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
8149     __ pminsb($tmp$$XMMRegister, $src2$$XMMRegister);
8150     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8151     __ movsbl($tmp2$$Register, $tmp2$$Register);
8152     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8153     __ movsbl($tmp3$$Register, $tmp3$$Register);
8154     __ cmpl($tmp2$$Register, $tmp3$$Register);
8155     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8156     __ cmpl($src1$$Register, $tmp3$$Register);
8157     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8158     __ movl($dst$$Register, $tmp3$$Register);
8159     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8160     __ movsbl($tmp2$$Register, $tmp2$$Register);
8161     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8162     __ movsbl($tmp3$$Register, $tmp3$$Register);
8163     __ cmpl($tmp2$$Register, $tmp3$$Register);
8164     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8165     __ cmpl($tmp3$$Register, $dst$$Register);
8166     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8167     __ movsbl($dst$$Register, $dst$$Register);
8168   %}
8169   ins_pipe( pipe_slow );
8170 %}
8171 
8172 instruct rsmin16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8173   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8174   match(Set dst (MinReductionV src1 src2));
8175   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8176   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
8177             "pminsb  $tmp4,$src2\n\t"
8178             "pshufd  $tmp,$tmp4,0x1\n\t"
8179             "pminsb  $tmp,$tmp4\n\t"
8180             "pextrb  $tmp2,$tmp, 0x1\n\t"
8181             "movsbl  $tmp2,$tmp2\n\t"
8182             "pextrb  $tmp3,$tmp,0x0\n\t"
8183             "movsbl  $tmp3,$tmp3\n\t"
8184             "cmpl  $tmp2,$tmp3\n\t"
8185             "cmovl  $tmp3,$tmp2\n\t"
8186             "cmpl  $src1,$tmp3\n\t"
8187             "cmovl  $tmp3,$src1\n\t"
8188             "movl  $dst,$tmp3\n\t"
8189             "pextrb  $tmp2,$tmp,0x3\n\t"
8190             "movsbl  $tmp2,$tmp2\n\t"
8191             "pextrb  $tmp3,$tmp,0x2\n\t"
8192             "movsbl  $tmp3,$tmp3\n\t"
8193             "cmpl  $tmp2,$tmp3\n\t"
8194             "cmovl  $tmp3,$tmp2\n\t"
8195             "cmpl  $tmp3,$dst\n\t"
8196             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction16B" %}
8197   ins_encode %{
8198     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
8199     __ pminsb($tmp4$$XMMRegister, $src2$$XMMRegister);
8200     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8201     __ pminsb($tmp$$XMMRegister, $tmp4$$XMMRegister);
8202     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8203     __ movsbl($tmp2$$Register, $tmp2$$Register);
8204     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8205     __ movsbl($tmp3$$Register, $tmp3$$Register);
8206     __ cmpl($tmp2$$Register, $tmp3$$Register);
8207     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8208     __ cmpl($src1$$Register, $tmp3$$Register);
8209     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8210     __ movl($dst$$Register, $tmp3$$Register);
8211     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8212     __ movsbl($tmp2$$Register, $tmp2$$Register);
8213     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8214     __ movsbl($tmp3$$Register, $tmp3$$Register);
8215     __ cmpl($tmp2$$Register, $tmp3$$Register);
8216     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8217     __ cmpl($tmp3$$Register, $dst$$Register);
8218     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8219     __ movsbl($dst$$Register, $dst$$Register);
8220   %}
8221   ins_pipe( pipe_slow );
8222 %}
8223 
8224 instruct rvmin16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8225   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8226   match(Set dst (MinReductionV src1 src2));
8227   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8228   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
8229             "vpminsb  $tmp4,$tmp4,$src2\n\t"
8230             "pshufd  $tmp,$tmp4,0x1\n\t"
8231             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8232             "pextrb  $tmp2,$tmp, 0x1\n\t"
8233             "movsbl  $tmp2,$tmp2\n\t"
8234             "pextrb  $tmp3,$tmp,0x0\n\t"
8235             "movsbl  $tmp3,$tmp3\n\t"
8236             "cmpl  $tmp2,$tmp3\n\t"
8237             "cmovl  $tmp3,$tmp2\n\t"
8238             "cmpl  $src1,$tmp3\n\t"
8239             "cmovl  $tmp3,$src1\n\t"
8240             "movl  $dst,$tmp3\n\t"
8241             "pextrb  $tmp2,$tmp,0x3\n\t"
8242             "movsbl  $tmp2,$tmp2\n\t"
8243             "pextrb  $tmp3,$tmp,0x2\n\t"
8244             "movsbl  $tmp3,$tmp3\n\t"
8245             "cmpl  $tmp2,$tmp3\n\t"
8246             "cmovl  $tmp3,$tmp2\n\t"
8247             "cmpl  $tmp3,$dst\n\t"
8248             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction16B" %}
8249   ins_encode %{
8251     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
8252     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 0);
8253     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8254     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8255     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8256     __ movsbl($tmp2$$Register, $tmp2$$Register);
8257     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8258     __ movsbl($tmp3$$Register, $tmp3$$Register);
8259     __ cmpl($tmp2$$Register, $tmp3$$Register);
8260     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8261     __ cmpl($src1$$Register, $tmp3$$Register);
8262     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8263     __ movl($dst$$Register, $tmp3$$Register);
8264     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8265     __ movsbl($tmp2$$Register, $tmp2$$Register);
8266     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8267     __ movsbl($tmp3$$Register, $tmp3$$Register);
8268     __ cmpl($tmp2$$Register, $tmp3$$Register);
8269     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8270     __ cmpl($tmp3$$Register, $dst$$Register);
8271     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8272     __ movsbl($dst$$Register, $dst$$Register);
8273   %}
8274   ins_pipe( pipe_slow );
8275 %}
8276 
8277 instruct rvmin32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8278   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8279   match(Set dst (MinReductionV src1 src2));
8280   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8281   format %{ "vextracti128_high  $tmp,$src2\n\t"
8282             "vpminsb  $tmp,$tmp,$src2\n\t"
8283             "pshufd  $tmp4,$tmp,0xE\n\t"
8284             "vpminsb  $tmp4,$tmp4,$tmp\n\t"
8285             "pshufd  $tmp,$tmp4,0x1\n\t"
8286             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8287             "pextrb  $tmp2,$tmp, 0x1\n\t"
8288             "movsbl  $tmp2,$tmp2\n\t"
8289             "pextrb  $tmp3,$tmp,0x0\n\t"
8290             "movsbl  $tmp3,$tmp3\n\t"
8291             "cmpl  $tmp2,$tmp3\n\t"
8292             "cmovl  $tmp3,$tmp2\n\t"
8293             "cmpl  $src1,$tmp3\n\t"
8294             "cmovl  $tmp3,$src1\n\t"
8295             "movl  $dst,$tmp3\n\t"
8296             "pextrb  $tmp2,$tmp,0x3\n\t"
8297             "movsbl  $tmp2,$tmp2\n\t"
8298             "pextrb  $tmp3,$tmp,0x2\n\t"
8299             "movsbl  $tmp3,$tmp3\n\t"
8300             "cmpl  $tmp2,$tmp3\n\t"
8301             "cmovl  $tmp3,$tmp2\n\t"
8302             "cmpl  $tmp3,$dst\n\t"
8303             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction32B" %}
8304   ins_encode %{
8305     int vector_len = 1;
8306     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8307     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8308     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
8309     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
8310     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8311     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8312     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8313     __ movsbl($tmp2$$Register, $tmp2$$Register);
8314     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8315     __ movsbl($tmp3$$Register, $tmp3$$Register);
8316     __ cmpl($tmp2$$Register, $tmp3$$Register);
8317     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8318     __ cmpl($src1$$Register, $tmp3$$Register);
8319     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8320     __ movl($dst$$Register, $tmp3$$Register);
8321     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8322     __ movsbl($tmp2$$Register, $tmp2$$Register);
8323     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8324     __ movsbl($tmp3$$Register, $tmp3$$Register);
8325     __ cmpl($tmp2$$Register, $tmp3$$Register);
8326     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8327     __ cmpl($tmp3$$Register, $dst$$Register);
8328     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8329     __ movsbl($dst$$Register, $dst$$Register);
8330   %}
8331   ins_pipe( pipe_slow );
8332 %}
8333 
8334 instruct rvmin64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
8335   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
8336   match(Set dst (MinReductionV src1 src2));
8337   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8338   format %{ "vextracti64x4_high  $tmp4,$src2\n\t"
8339             "vpminsb  $tmp4,$tmp4,$src2\n\t"
8340             "vextracti128_high  $tmp,$tmp4\n\t"
8341             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8342             "pshufd  $tmp4,$tmp,0xE\n\t"
8343             "vpminsb  $tmp4,$tmp4,$tmp\n\t"
8344             "pshufd  $tmp,$tmp4,0x1\n\t"
8345             "vpminsb  $tmp,$tmp,$tmp4\n\t"
8346             "pextrb  $tmp2,$tmp, 0x1\n\t"
8347             "movsbl  $tmp2,$tmp2\n\t"
8348             "pextrb  $tmp3,$tmp,0x0\n\t"
8349             "movsbl  $tmp3,$tmp3\n\t"
8350             "cmpl  $tmp2,$tmp3\n\t"
8351             "cmovl  $tmp3,$tmp2\n\t"
8352             "cmpl  $src1,$tmp3\n\t"
8353             "cmovl  $tmp3,$src1\n\t"
8354             "movl  $dst,$tmp3\n\t"
8355             "pextrb  $tmp2,$tmp,0x3\n\t"
8356             "movsbl  $tmp2,$tmp2\n\t"
8357             "pextrb  $tmp3,$tmp,0x2\n\t"
8358             "movsbl  $tmp3,$tmp3\n\t"
8359             "cmpl  $tmp2,$tmp3\n\t"
8360             "cmovl  $tmp3,$tmp2\n\t"
8361             "cmpl  $tmp3,$dst\n\t"
8362             "cmovl  $dst,$tmp3\n\t"
                 "movsbl  $dst,$dst\t! min reduction64B" %}
8363   ins_encode %{
8364     __ vextracti64x4_high($tmp4$$XMMRegister, $src2$$XMMRegister);
8365     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 2);
8366     __ vextracti128_high($tmp$$XMMRegister, $tmp4$$XMMRegister);
8367     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 1);
8368     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
8369     __ vpminsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
8370     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
8371     __ vpminsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
8372     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
8373     __ movsbl($tmp2$$Register, $tmp2$$Register);
8374     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
8375     __ movsbl($tmp3$$Register, $tmp3$$Register);
8376     __ cmpl($tmp2$$Register, $tmp3$$Register);
8377     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8378     __ cmpl($src1$$Register, $tmp3$$Register);
8379     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8380     __ movl($dst$$Register, $tmp3$$Register);
8381     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
8382     __ movsbl($tmp2$$Register, $tmp2$$Register);
8383     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
8384     __ movsbl($tmp3$$Register, $tmp3$$Register);
8385     __ cmpl($tmp2$$Register, $tmp3$$Register);
8386     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8387     __ cmpl($tmp3$$Register, $dst$$Register);
8388     __ cmovl(Assembler::less, $dst$$Register, $tmp3$$Register);
8389     __ movsbl($dst$$Register, $dst$$Register);
8390   %}
8391   ins_pipe( pipe_slow );
8392 %}
8393 
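// Short min reductions: pminsw/vpminsw folds the vector, pextrw and movswl pull
// out and sign-extend the last two words, and cmpl/cmovl picks the smaller of
// those and of src1.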
8394 instruct rsmin4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
8395   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8396   match(Set dst (MinReductionV src1 src2));
8397   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8398   format %{ "pshufd  $tmp,$src2,0x1\n\t"
8399             "pminsw  $tmp,$src2\n\t"
8400             "pextrw  $tmp2,$tmp, 0x1\n\t"
8401             "movswl  $tmp2,$tmp2\n\t"
8402             "pextrw  $tmp3,$tmp, 0x0\n\t"
8403             "movswl  $tmp3,$tmp3\n\t"
8404             "cmpl  $tmp2,$tmp3\n\t"
8405             "cmovl  $tmp3,$tmp2\n\t"
8406             "cmpl  $src1,$tmp3\n\t"
8407             "cmovl  $tmp3,$src1\n\t"
8408             "movl  $dst,$tmp3\t! min reduction4S" %}
8409   ins_encode %{
8410     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
8411     __ pminsw($tmp$$XMMRegister, $src2$$XMMRegister);
8412     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
8413     __ movswl($tmp2$$Register, $tmp2$$Register);
8414     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8415     __ movswl($tmp3$$Register, $tmp3$$Register);
8416     __ cmpl($tmp2$$Register, $tmp3$$Register);
8417     __ cmovl(Assembler::less, $tmp3$$Register, $tmp2$$Register);
8418     __ cmpl($src1$$Register, $tmp3$$Register);
8419     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8420     __ movl($dst$$Register, $tmp3$$Register);
8421   %}
8422   ins_pipe( pipe_slow );
8423 %}
8424 
8425 instruct rsmin8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8426   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8427   match(Set dst (MinReductionV src1 src2));
8428   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8429   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
8430             "pminsw  $tmp2,$src2\n\t"
8431             "pshufd  $tmp,$tmp2,0x1\n\t"
8432             "pminsw  $tmp,$tmp2\n\t"
8433             "pextrw  $tmp4,$tmp, 0x1\n\t"
8434             "movswl  $tmp4,$tmp4\n\t"
8435             "pextrw  $tmp3,$tmp, 0x0\n\t"
8436             "movswl  $tmp3,$tmp3\n\t"
8437             "cmpl    $tmp4,$tmp3\n\t"
8438             "cmovl  $tmp3,$tmp4\n\t"
8439             "cmpl  $src1,$tmp3\n\t"
8440             "cmovl  $tmp3,$src1\n\t"
8441             "movl  $dst,$tmp3\t! min reduction8S" %}
8442   ins_encode %{
8443     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister,0xE);
8444     __ pminsw($tmp2$$XMMRegister, $src2$$XMMRegister);
8445     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8446     __ pminsw($tmp$$XMMRegister, $tmp2$$XMMRegister);
8447     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8448     __ movswl($tmp4$$Register, $tmp4$$Register);
8449     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8450     __ movswl($tmp3$$Register, $tmp3$$Register);
8451     __ cmpl($tmp4$$Register, $tmp3$$Register);
8452     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8453     __ cmpl($src1$$Register, $tmp3$$Register);
8454     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8455     __ movl($dst$$Register, $tmp3$$Register);
8456   %}
8457   ins_pipe( pipe_slow );
8458 %}
8459 
8460 instruct rvmin8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8461   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8462   match(Set dst (MinReductionV src1 src2));
8463   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8464   format %{ "pshufd   $tmp,$src2,0xE\n\t"
8465             "vpminsw  $tmp,$tmp,$src2\n\t"
8466             "pshufd   $tmp2,$tmp,0x1\n\t"
8467             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8468             "pextrw   $tmp4,$tmp, 0x1\n\t"
8469             "movswl   $tmp4,$tmp4\n\t"
8470             "pextrw   $tmp3,$tmp, 0x0\n\t"
8471             "movswl   $tmp3,$tmp3\n\t"
8472             "cmpl     $tmp4,$tmp3\n\t"
8473             "cmovl    $tmp3,$tmp4\n\t"
                 "cmpl     $src1,$tmp3\n\t"
                 "cmovl    $tmp3,$src1\n\t"
                 "movl     $dst,$tmp3\t! min reduction8S" %}
8474   ins_encode %{
8475     int vector_len = 0;
8476     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8477     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8478     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8479     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8480     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8481     __ movswl($tmp4$$Register, $tmp4$$Register);
8482     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8483     __ movswl($tmp3$$Register, $tmp3$$Register);
8484     __ cmpl($tmp4$$Register, $tmp3$$Register);
8485     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8486     __ cmpl($src1$$Register, $tmp3$$Register);
8487     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8488     __ movl($dst$$Register, $tmp3$$Register);
8489   %}
8490   ins_pipe( pipe_slow );
8491 %}
8492 
8493 instruct rvmin16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8494   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8495   match(Set dst (MinReductionV src1 src2));
8496   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8497   format %{ "vextracti128_high  $tmp,$src2\n\t"
8498             "vpminsw  $tmp,$tmp,$src2\n\t"
8499             "pshufd  $tmp2,$tmp,0xE\n\t"
8500             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8501             "pshufd  $tmp2,$tmp,0x1\n\t"
8502             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8503             "pextrw  $tmp4,$tmp, 0x1\n\t"
8504             "movswl  $tmp4,$tmp4\n\t"
8505             "pextrw  $tmp3,$tmp, 0x0\n\t"
8506             "movswl  $tmp3,$tmp3\n\t"
8507             "cmpl  $tmp4,$tmp3\n\t"
8508             "cmovl  $tmp3,$tmp4\n\t"
8509             "cmpl  $src1,$tmp3\n\t"
8510             "cmovl  $tmp3,$src1\n\t"
8511             "movl  $dst,$tmp3\t! min reduction16S" %}
8512   ins_encode %{
8513     int vector_len = 1;
8514     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8515     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8516     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8517     __ vpminsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8518     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8519     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8520     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8521     __ movswl($tmp4$$Register, $tmp4$$Register);
8522     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8523     __ movswl($tmp3$$Register, $tmp3$$Register);
8524     __ cmpl($tmp4$$Register, $tmp3$$Register);
8525     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8526     __ cmpl($src1$$Register, $tmp3$$Register);
8527     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8528     __ movl($dst$$Register, $tmp3$$Register);
8529   %}
8530   ins_pipe( pipe_slow );
8531 %}
8532 
8533 instruct rvmin32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
8534   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
8535   match(Set dst (MinReductionV src1 src2));
8536   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
8537   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
8538             "vpminsw  $tmp2,$tmp2,$src2\n\t"
8539             "vextracti128_high  $tmp,$tmp2\n\t"
8540             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8541             "pshufd  $tmp2,$tmp,0xE\n\t"
8542             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8543             "pshufd  $tmp2,$tmp,0x1\n\t"
8544             "vpminsw  $tmp,$tmp,$tmp2\n\t"
8545             "pextrw  $tmp4,$tmp, 0x1\n\t"
8546             "movswl  $tmp4,$tmp4\n\t"
8547             "pextrw  $tmp3,$tmp, 0x0\n\t"
8548             "movswl  $tmp3,$tmp3\n\t"
8549             "cmpl  $tmp4,$tmp3\n\t"
8550             "cmovl  $tmp3,$tmp4\n\t"
8551             "cmpl  $src1,$tmp3\n\t"
8552             "cmovl  $tmp3,$src1\n\t"
8553             "movl  $dst,$tmp3\t! min reduction32S" %}
8554   ins_encode %{
8555     int vector_len = 2;
8556     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8557     __ vpminsw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8558     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
8559     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8560     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8561     __ vpminsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8562     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
8563     __ vpminsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8564     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
8565     __ movswl($tmp4$$Register, $tmp4$$Register);
8566     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
8567     __ movswl($tmp3$$Register, $tmp3$$Register);
8568     __ cmpl($tmp4$$Register, $tmp3$$Register);
8569     __ cmovl(Assembler::less, $tmp3$$Register, $tmp4$$Register);
8570     __ cmpl($src1$$Register, $tmp3$$Register);
8571     __ cmovl(Assembler::less, $tmp3$$Register, $src1$$Register);
8572     __ movl($dst$$Register, $tmp3$$Register);
8573   %}
8574   ins_pipe( pipe_slow );
8575 %}
8576 
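// The integer min reductions below share one halving scheme: fold the upper
// half of the vector into the lower half (vextracti*_high for the 512/256-bit
// steps, then pshufd 0xE and 0x1 for the 64- and 32-bit steps), apply a packed
// min at each step, and finally fold in the scalar accumulator $src1 through a
// movd + pmin (or cmp/cmov) tail. Roughly, per step: v = min(v_lo, v_hi).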
8577 instruct rsmin2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
8578   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8579   match(Set dst (MinReductionV src1 src2));
8580   effect(TEMP tmp, TEMP tmp2);
8581   format %{ "pshufd  $tmp,$src2,0x1\n\t"
8582             "pminsd  $tmp,$src2\n\t"
8583             "movd    $tmp2,$src1\n\t"
8584             "pminsd  $tmp2,$tmp\n\t"
8585             "movd    $dst,$tmp2\t! min reduction2I" %}
8586   ins_encode %{
8587     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8588     __ pminsd($tmp$$XMMRegister, $src2$$XMMRegister);
8589     __ movdl($tmp2$$XMMRegister, $src1$$Register);
8590     __ pminsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
8591     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8592   %}
8593   ins_pipe( pipe_slow );
8594 %}
8595 
8596 instruct rvmin2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
8597   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8598   match(Set dst (MinReductionV src1 src2));
8599   effect(TEMP tmp, TEMP tmp2);
8600   format %{ "pshufd   $tmp,$src2,0x1\n\t"
            "vpminsd  $tmp,$tmp,$src2\n\t"
            "movd     $tmp2,$src1\n\t"
8603             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8604             "movd     $dst,$tmp2\t! min reduction2I" %}
8605   ins_encode %{
8606     int vector_len = 0;
8607     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
8608     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8609     __ movdl($tmp2$$XMMRegister, $src1$$Register);
8610     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8611     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8612   %}
8613   ins_pipe( pipe_slow );
8614 %}
8615 
8616 instruct rsmin4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
8617   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8618   match(Set dst (MinReductionV src1 src2));
8619   effect(TEMP tmp, TEMP tmp2);
8620   format %{ "pshufd  $tmp,$src2,0xE\n\t"
8621             "pminsd  $tmp,$src2\n\t"
8622             "pshufd  $tmp2,$tmp,0x1\n\t"
8623             "pminsd  $tmp2,$tmp\n\t"
8624             "movd    $tmp,$src1\n\t"
8625             "pminsd  $tmp2,$tmp\n\t"
8626             "movd    $dst,$tmp2\t! min reduction4I" %}
8627   ins_encode %{
8628     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8629     __ pminsd($tmp$$XMMRegister, $src2$$XMMRegister);
8630     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
8631     __ pminsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
8632     __ movdl($tmp$$XMMRegister, $src1$$Register);
8633     __ pminsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
8634     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8635   %}
8636   ins_pipe( pipe_slow );
8637 %}
8638 
8639 instruct rvmin4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
8640   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8641   match(Set dst (MinReductionV src1 src2));
8642   effect(TEMP tmp, TEMP tmp2);
8643   format %{ "pshufd   $tmp,$src2,0xE\n\t"
8644             "vpminsd  $tmp2,$tmp,$src2\n\t"
8645             "pshufd   $tmp,$tmp2,0x1\n\t"
8646             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8647             "movd     $tmp,$src1\n\t"
8648             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8649             "movd     $dst,$tmp2\t! min reduction4I" %}
8650   ins_encode %{
8651     int vector_len = 0;
8652     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8653     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8654     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8655     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8656     __ movdl($tmp$$XMMRegister, $src1$$Register);
8657     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8658     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8659   %}
8660   ins_pipe( pipe_slow );
8661 %}
8662 
8663 instruct rvmin4I_reduction_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
8664   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8665   match(Set dst (MinReductionV src1 src2));
8666   effect(TEMP tmp, TEMP tmp2);
8667   format %{ "pshufd   $tmp,$src2,0xE\n\t"
8668             "vpminsd  $tmp2,$tmp,$src2\n\t"
8669             "pshufd   $tmp,$tmp2,0x1\n\t"
8670             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8671             "movd     $tmp,$src1\n\t"
8672             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8673             "movd     $dst,$tmp2\t! min reduction4I" %}
8674   ins_encode %{
8675     int vector_len = 0;
8676     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
8677     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8678     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8679     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8680     __ movdl($tmp$$XMMRegister, $src1$$Register);
8681     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8682     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8683   %}
8684   ins_pipe( pipe_slow );
8685 %}
8686 
8687 instruct rvmin8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
8688   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8689   match(Set dst (MinReductionV src1 src2));
8690   effect(TEMP tmp, TEMP tmp2);
8691   format %{ "vextracti128_high   $tmp,$src2\n\t"
8692             "vpminsd  $tmp,$tmp,$src2\n\t"
8693             "pshufd   $tmp2,$tmp,0xE\n\t"
8694             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8695             "pshufd   $tmp,$tmp2,0x1\n\t"
8696             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8697             "movd     $tmp,$src1\n\t"
8698             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8699             "movd     $dst,$tmp2\t! min reduction8I" %}
8700   ins_encode %{
8701     int vector_len = 1;
8702     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8703     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8704     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8705     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8706     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8707     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8708     __ movdl($tmp$$XMMRegister, $src1$$Register);
8709     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8710     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8711   %}
8712   ins_pipe( pipe_slow );
8713 %}
8714 
8715 instruct rvmin8I_reduction_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
8716   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8717   match(Set dst (MinReductionV src1 src2));
8718   effect(TEMP tmp, TEMP tmp2);
8719   format %{ "vextracti128_high   $tmp,$src2\n\t"
8720             "vpminsd  $tmp,$tmp,$src2\n\t"
8721             "pshufd   $tmp2,$tmp,0xE\n\t"
8722             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8723             "pshufd   $tmp,$tmp2,0x1\n\t"
8724             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8725             "movd     $tmp,$src1\n\t"
8726             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8727             "movd     $dst,$tmp2\t! min reduction8I" %}
8728   ins_encode %{
8729     int vector_len = 1;
8730     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
8731     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
8732     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8733     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8734     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8735     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8736     __ movdl($tmp$$XMMRegister, $src1$$Register);
8737     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8738     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8739   %}
8740   ins_pipe( pipe_slow );
8741 %}
8742 
8743 instruct rvmin16I_reduction_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
8744   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
8745   match(Set dst (MinReductionV src1 src2));
8746   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8747   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
8748             "vpminsd  $tmp3,$tmp3,$src2\n\t"
8749             "vextracti128_high   $tmp,$tmp3\n\t"
8750             "vpminsd  $tmp,$tmp,$tmp3\n\t"
8751             "pshufd   $tmp2,$tmp,0xE\n\t"
8752             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8753             "pshufd   $tmp,$tmp2,0x1\n\t"
8754             "vpminsd  $tmp2,$tmp2,$tmp\n\t"
8755             "movd     $tmp,$src1\n\t"
8756             "vpminsd  $tmp2,$tmp,$tmp2\n\t"
8757             "movd     $dst,$tmp2\t! min reduction16I" %}
8758   ins_encode %{
8759     int vector_len = 2;
8760     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
8761     __ vpminsd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
8762     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
8763     __ vpminsd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8764     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
8765     __ vpminsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8766     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
8767     __ vpminsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8768     __ movdl($tmp$$XMMRegister, $src1$$Register);
8769     __ vpminsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8770     __ movdl($dst$$Register, $tmp2$$XMMRegister);
8771   %}
8772   ins_pipe( pipe_slow );
8773 %}
8774 
8775 // Long Min Reduction
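// Before AVX-512 (vpminsq) there is no packed signed 64-bit min instruction,
// so these patterns synthesize it: pcmpgtq builds an all-ones mask in each
// lane where a > b, and blendvpd then selects b for the masked lanes, giving
// min(a, b) per lane. Note that the non-VEX blendvpd takes its mask
// implicitly in xmm0, which is why the SSE flavors pin a temporary to rxmm0.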
8776 instruct rsmin1L_reduction_reg(rRegL dst, rRegL src1, legVecD src2, rxmm0 tmp, legVecD tmp2) %{
8777   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8778   match(Set dst (MinReductionV src1 src2));
8779   effect(TEMP tmp, TEMP tmp2);
8780   format %{ "movdq      $tmp,$src1\n\t"
8781             "movdq      $tmp2,$src1\n\t"
8782             "pcmpgtq   $tmp,$src2\n\t"
8783             "blendvpd  $tmp2,$src2\n\t"
8784             "movdq      $dst,$tmp2\t! min reduction1L" %}
8785   ins_encode %{
8786     __ movdq($tmp$$XMMRegister,$src1$$Register);
8787     __ movdq($tmp2$$XMMRegister,$src1$$Register);
8788     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
8789     __ blendvpd($tmp2$$XMMRegister,$src2$$XMMRegister);
8790     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8791   %}
8792   ins_pipe( pipe_slow );
8793 %}
8794 
8795 instruct rsmin2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, rxmm0 xmm_0, vecX tmp2, vecX tmp3) %{
8796   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8797   match(Set dst (MinReductionV src1 src2));
8798   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
8799   format %{ "pshufd  $tmp3,$src2,0xE\n\t"
8800             "movdqu  $xmm_0,$src2\n\t"
8801             "movdqu  $tmp2,$src2\n\t"
8802             "pcmpgtq  $xmm_0,$tmp3\n\t"
8803             "blendvpd  $tmp2,$tmp3\n\t"
8804             "movdqu  $xmm_0,$tmp2\n\t"
8805             "movdq  $tmp3,$src1\n\t"
8806             "pcmpgtq  $xmm_0,$tmp3\n\t"
8807             "blendvpd  $tmp2,$tmp3\n\t"
8808             "movq  $dst,$tmp2\t! min reduction2L" %}
8809   ins_encode %{
8810     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 0xE);
8811     __ movdqu($xmm_0$$XMMRegister, $src2$$XMMRegister);
8812     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
8813     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
8814     __ blendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister);
8815     __ movdqu($xmm_0$$XMMRegister, $tmp2$$XMMRegister);
8816     __ movdq($tmp3$$XMMRegister, $src1$$Register);
8817     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
8818     __ blendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister);
8819     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8820   %}
8821   ins_pipe( pipe_slow );
8822 %}
8823 
8824 instruct rvmin2L_reduction_reg(rRegL dst, rRegL src1, legVecX src2, legVecX tmp, legVecX tmp2, legVecX tmp3) %{
8825   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8826   match(Set dst (MinReductionV src1 src2));
8827   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8828   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
8829             "vpcmpgtq  $tmp,$tmp2,$src2\n\t"
8830             "vblendvpd   $tmp2,$tmp2,$src2,$tmp\n\t"
8831             "movq     $tmp,$src1\n\t"
8832             "vpcmpgtq  $tmp3,$tmp2,$tmp\n\t"
            "vblendvpd   $tmp2,$tmp2,$tmp,$tmp3\n\t"
8834             "movq     $dst,$tmp2\t! min reduction2L" %}
8835   ins_encode %{
8836     int vector_len = 0;
8837     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
8838     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8839     __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8840     __ movdq($tmp$$XMMRegister,$src1$$Register);
8841     __ vpcmpgtq($tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
8843     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8844   %}
8845   ins_pipe( pipe_slow );
8846 %}
8847 
8848 instruct rvmin4L_reduction_reg(rRegL dst, rRegL src1, legVecY src2, legVecY tmp, legVecY tmp2, legVecY tmp3) %{
8849   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8850   match(Set dst (MinReductionV src1 src2));
8851   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8852   format %{ "vextracti128_high   $tmp2,$src2\n\t"
8853             "vpcmpgtq  $tmp,$tmp2,$src2\n\t"
8854             "vblendvpd   $tmp2,$tmp2,$src2,$tmp\n\t"
            "pshufd    $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq  $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd $tmp3,$tmp3,$tmp2,$tmp\n\t"
            "movq     $tmp,$src1\n\t"
            "vpcmpgtq  $tmp2,$tmp3,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp,$tmp2\n\t"
            "movq     $dst,$tmp2\t! min reduction4L" %}
8862   ins_encode %{
8863     int vector_len = 1;
8864     __ vextracti128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
8865     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
8866     __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8867     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8868     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8869     __ vblendvpd($tmp3$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
8870     __ movdq($tmp$$XMMRegister,$src1$$Register);
8871     __ vpcmpgtq($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
8872     __ vblendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister,$tmp$$XMMRegister,$tmp2$$XMMRegister, vector_len);
8873     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8874   %}
8875   ins_pipe( pipe_slow );
8876 %}
8877 
8878 instruct rvmin8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
8879   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
8880   match(Set dst (MinReductionV src1 src2));
8881   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
8882   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
8883             "vpcmpgtq  $tmp,$tmp3,$src2\n\t"
8884             "vblendvpd   $tmp3,$tmp3,$src2,$tmp\n\t"
8885             "vextracti128_high   $tmp2,$tmp3\n\t"
8886             "vpcmpgtq  $tmp,$tmp2,$tmp3\n\t"
8887             "vblendvpd   $tmp2,$tmp2,$tmp3,$tmp\n\t"
            "pshufd  $tmp3,$tmp2,0xE\n\t"
8889             "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
8890             "vblendvpd  $tmp3,$tmp3,$tmp2,$tmp\n\t"
8891             "movq     $tmp2,$src1\n\t"
8892             "vpcmpgtq  $tmp,$tmp2,$tmp3\n\t"
8893             "vblendvpd  $tmp2,$tmp2,$tmp3,$tmp\n\t"
            "movq     $dst,$tmp2\t! min reduction8L" %}
8895   ins_encode %{
8896     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
8897     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
8898     __ vblendvpd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister, 1);
8899     __ vextracti128_high($tmp2$$XMMRegister, $tmp3$$XMMRegister);
8900     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, 1);
8901     __ vblendvpd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, 1);
8902     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
8903     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, 1);
8904     __ vblendvpd($tmp3$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, 1);
8905     __ movdq($tmp2$$XMMRegister, $src1$$Register);
8906     __ vpcmpgtq($tmp$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, 1);
8907     __ vblendvpd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, 1);
8908     __ movdq($dst$$Register, $tmp2$$XMMRegister);
8909   %}
8910   ins_pipe( pipe_slow );
8911 %}
8912 
8913 // Float Min Reduction
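// A bare minps/minpd cannot implement Java's Math.min: Java requires NaN to
// propagate from either input and -0.0 to order below +0.0, whereas the x86
// min instructions simply return the second operand when an input is NaN.
// The vmin_max_macro/vmin_max_macro_evex helpers wrap the required
// compare-and-blend sequence; their two trailing boolean flags appear to
// select single vs. double precision and min vs. max (the float patterns
// pass (true, true), the double patterns (false, true)).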
8914 instruct rvmin2F_reduction_reg_av(legRegF dst, legVecD src, legVecD tmp, legVecD dtmp,
8915                                   legVecD atmp, legVecD btmp, legVecX xmm_1) %{
8916   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8917   match(Set dst (MinReductionV dst src));
8918   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
8919   format %{ "vpermilps    $xmm_1,$src,1\n\t"
8920             "vminps_macro $dtmp,$xmm_1,$src\t! minps\n\t"
8921             "vminps_macro $dst,$dtmp,$dst\t! minps" %}
8922   ins_encode %{
8923     int vector_len = 0;
8924     __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
8925     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
8926                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
8927     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
8928                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
8929   %}
8930   ins_pipe( pipe_slow );
8931 %}
8932 
8933 instruct rvmin2F_reduction_reg(legRegF dst, immF src1, legVecD src2, legVecD tmp,
8934                                legVecD atmp, legVecD btmp, legVecX xmm_1) %{
8935   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF && 
8936             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8937   match(Set dst (MinReductionV src1 src2));
8938   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
8939   format %{ "vpermilps    $xmm_1,$src2,1\n\t"
8940             "vminps_macro $dst,$xmm_1,$src2\t! minps" %}
8941   ins_encode %{
8942     int vector_len = 0;
8943     __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
8944     __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
8945                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, true, vector_len);
8946   %}
8947   ins_pipe( pipe_slow );
8948 %}
8949 
8950 instruct rvmin4F_reduction_reg_av(legRegF dst, legVecX src, legVecX tmp, legVecX dtmp, 
8951                                   legVecX atmp, legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
8952   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8953   match(Set dst (MinReductionV dst src));
8954   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
8955   format %{ "vpermilps    $xmm_1,$src,14\n\t"
8956             "vminps_macro $xmm_0,$xmm_1,$src\t! minps\n\t"
8957             "vpermilps    $xmm_1,$xmm_0,1\n\t"
8958             "vminps_macro $dtmp,$xmm_1,$xmm_0\t! minps\n\t"
8959             "vminps_macro $dst,$dtmp,$dst\t! minps" %}
8960   ins_encode %{
8961     int vector_len = 0;
8962     __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 14, vector_len);
8963     __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
8964                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
8965     __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
8966     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
8967                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
8968     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
8969                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
8970   %}
8971   ins_pipe( pipe_slow );
8972 %}
8973 
8974 instruct rvmin4F_reduction_reg(legRegF dst, immF src1, legVecX src2, legVecX tmp, legVecX atmp,
8975                                legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
8976   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF && 
8977             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8978   match(Set dst (MinReductionV src1 src2));
8979   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
8980   format %{ "vpermilps    $xmm_1,$src2,14\n\t"
8981             "vminps_macro $xmm_0,$xmm_1,$src2\t! minps\n\t"
8982             "vpermilps    $xmm_1,$xmm_0,1\n\t"
8983             "vminps_macro $dst,$xmm_1,$xmm_0\t! minps" %}
8984   ins_encode %{
8985     int vector_len = 0;
8986     __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 14, vector_len);
8987     __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
8988                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
8989     __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
8990     __ vmin_max_macro($dst$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
8991                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
8992   %}
8993   ins_pipe( pipe_slow );
8994 %}
8995 
8996 instruct rvmin8F_reduction_reg_av(legRegF dst, legVecY src, legVecY tmp, legVecY dtmp, legVecY atmp,
8997                                   legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
8998   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
8999   match(Set dst (MinReductionV dst src));
9000   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9001   format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
9002             "vminps_macro $ymm_0,$ymm_1,$src\t! minps\n\t"
9003             "vpermilps    $ymm_1,$ymm_0,14\n\t"
            "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9005             "vpermilps    $ymm_1,$ymm_0,1\n\t"
9006             "vminps_macro $dtmp,$ymm_1,$ymm_0\t! minps\n\t" 
9007             "vminps_macro $dst,$dtmp,$dst\t! minps" %}
9008   ins_encode %{
9009     int vector_len = 1;
9010     __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
9011     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
9012                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9013     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
9014     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
9015                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9016     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9017     __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
9018                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9019     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
9020                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9021   %}
9022   ins_pipe( pipe_slow );
9023 %}
9024 
9025 instruct rvmin8F_reduction_reg(legRegF dst, immF src1, legVecY src2, legVecY tmp,
9026                                legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
9027   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF && 
9028             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9029   match(Set dst (MinReductionV src1 src2));
9030   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9031   format %{ "vperm2f128   $ymm_1,$src2,$src2, 1\n\t"
9032             "vminps_macro $ymm_0,$ymm_1,$src2\t! minps\n\t"
9033             "vpermilps    $ymm_1,$ymm_0,14\n\t"
            "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9035             "vpermilps    $ymm_1,$ymm_0,1\n\t"
9036             "vminps_macro $dst,$ymm_1,$ymm_0\t! minps" %}
9037   ins_encode %{
9038     int vector_len = 1;
9039     __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
9040     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
9041                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9042     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
9043     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
9044                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9045     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9046     __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
9047                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9048   %}
9049   ins_pipe( pipe_slow );
9050 %}
9051 
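// The 512-bit forms reduce through two 256-bit halves (vextractf64x4) and use
// the EVEX macro flavor, which blends through an AVX-512 opmask register
// (ktmp = k1 below) rather than an XMM mask temporary.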
9052 instruct rvmin16F_reduction_reg_av(regF dst, vecZ src, vecZ tmp, vecZ dtmp,
9053                                    vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
9054   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9055   match(Set dst (MinReductionV dst src));
9056   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9057   format %{
9058        "vextractf64x4 $ymm_0, $src, 0\n\t"
9059        "vextractf64x4 $ymm_1, $src, 1\n\t"
9060        "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9061        "vpermpd      $ymm_1,$ymm_0,78\n\t"
       "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
       "vpermilps    $ymm_1,$ymm_0,14\n\t"
       "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9065        "vpermilps    $ymm_1,$ymm_0,1\n\t"
9066        "vminps_macro $dtmp,$ymm_1,$ymm_0\t! minps\n\t"
9067        "vminps_macro $dst,$dtmp,$dst\t! minps" %}
9068   ins_encode %{
9069     int vector_len = 1;
9070     KRegister ktmp = k1;
9071     __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
9072     __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
9073     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9074                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9075     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
9076     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9077                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9078     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
9079     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9080                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9081     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9082     __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
9083                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9084     __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
9085                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9086   %}
9087   ins_pipe( pipe_slow );
9088 %}
9089 
9090 instruct rvmin16F_reduction_reg(regF dst, immF src1, vecZ src2, vecZ tmp,
9091                                 vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
9092   predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeF::POS_INF && 
9093             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
9094   match(Set dst (MinReductionV src1 src2));
9095   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9096   format %{
9097        "vextractf64x4 $ymm_0, $src2, 0\n\t"
9098        "vextractf64x4 $ymm_1, $src2, 1\n\t"
9099        "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9100        "vpermpd      $ymm_1,$ymm_0, 78\n\t"
9101        "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9102        "vpermilps    $ymm_1,$ymm_0,14\n\t"
       "vminps_macro $ymm_0,$ymm_1,$ymm_0\t! minps\n\t"
9104        "vpermilps    $ymm_1,$ymm_0,1\n\t"
9105        "vminps_macro $dst,$ymm_1,$ymm_0\t! minps" %}
9106   ins_encode %{
9107     int vector_len = 1;
9108     KRegister ktmp = k1;
9109     __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
9110     __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
9111     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9112                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9113     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
9114     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9115                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9116     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
9117     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9118                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9119     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9120     __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
9121                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, true, vector_len);
9122   %}
9123   ins_pipe( pipe_slow );
9124 %}
9125 
9126 instruct rvmin2D_reduction_reg_av(legRegD dst, legVecX src, legVecX tmp, legVecX dtmp,
9127                                   legVecX atmp, legVecX btmp, legVecX xmm_1) %{
9128   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9129   match(Set dst (MinReductionV dst src));
9130   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
9131   format %{ "vpermilpd    $xmm_1,$src,1\n\t"
            "vminpd_macro $dtmp,$xmm_1,$src\t! minpd\n\t"
            "vminpd_macro $dst,$dtmp,$dst\t! minpd" %}
9134   ins_encode %{
9135     int vector_len = 0;
9136     __ vpermilpd($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
9137     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
9138                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
9139     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
9140                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
9141   %}
9142   ins_pipe( pipe_slow );
9143 %}
9144 
9145 instruct rvmin2D_reduction_reg(legRegD dst, immD src1, legVecX src2, legVecX tmp,
9146                                legVecX atmp, legVecX btmp, legVecX xmm_1) %{
9147   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::POS_INF && 
9148             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9149   match(Set dst (MinReductionV src1 src2));
9150   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
9151   format %{ "vpermilpd    $xmm_1,$src2,1\n\t"
            "vminpd_macro $dst,$xmm_1,$src2\t! minpd" %}
9153   ins_encode %{
9154     int vector_len = 0;
9155     __ vpermilpd($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
9156     __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
9157                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, true, vector_len);
9158   %}
9159   ins_pipe( pipe_slow );
9160 %}
9161 
9162 instruct rvmin4D_reduction_reg_av(legRegD dst, legVecY src, legVecY tmp, legVecY dtmp,
9163                                   legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
9164   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9165   match(Set dst (MinReductionV dst src));
9166   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9167   format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
            "vminpd_macro $ymm_0,$ymm_1,$src\t! minpd\n\t"
            "vpermilpd    $ymm_1,$ymm_0,1\n\t"
            "vminpd_macro $dtmp,$ymm_1,$ymm_0\t! minpd\n\t"
            "vminpd_macro $dst,$dtmp,$dst\t! minpd" %}
9172   ins_encode %{
9173     int vector_len = 1;
9174     __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
9175     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
9176                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9177     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9178     __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
9179                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9180     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
9181                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9182   %}
9183   ins_pipe( pipe_slow );
9184 %}
9185 
9186 instruct rvmin4D_reduction_reg(legRegD dst, immD src1, legVecY src2, legVecY tmp,
9187                                legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
9188   predicate(UseAVX > 0  && n->in(1)->as_Type()->type() == (Type*)TypeD::POS_INF && 
9189             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9190   match(Set dst (MinReductionV src1 src2));
9191   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9192   format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
            "vminpd_macro $ymm_0,$ymm_1,$src2\t! minpd\n\t"
            "vpermilpd    $ymm_1,$ymm_0,1\n\t"
            "vminpd_macro $dst,$ymm_1,$ymm_0\t! minpd" %}
9196   ins_encode %{
9197     int vector_len = 1;
9198     __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
9199     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
9200                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9201     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9202     __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
9203                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9204   %}
9205   ins_pipe( pipe_slow );
9206 %}
9207 
9208 instruct rvmin8D_reduction_reg_av(regD dst, vecZ src, vecZ tmp, vecZ dtmp, vecZ atmp,
9209                                   vecZ btmp, vecY ymm_0, vecY ymm_1) %{
9210   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9211   match(Set dst (MinReductionV dst src));
9212   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9213   format %{
9214        "vextractf64x4 $ymm_0, $src, 0\n\t"
9215        "vextractf64x4 $ymm_1, $src, 1\n\t"
9216        "vminpd_macro $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
9217        "vpermpd      $ymm_1,$ymm_0,14\n\t"
       "vminpd_macro $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
       "vpermilpd    $ymm_1,$ymm_0,1\n\t"
       "vminpd_macro $dtmp,$ymm_1,$ymm_0\t! minpd\n\t"
       "vminpd_macro $dst,$dtmp,$dst\t! minpd" %}
9222   ins_encode %{
9223     int vector_len = 1;
9224     KRegister ktmp = k1;
9225     __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
9226     __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
9227     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9228                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9229     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
9230     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9231                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9232     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
    __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9234                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9235     __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
9236                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9237   %}
9238   ins_pipe( pipe_slow );
9239 %}
9240 
9241 instruct rvmin8D_reduction_reg(regD dst, immD src1, vecZ src2, vecZ tmp, 
9242                                vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
9243   predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeD::POS_INF && 
9244             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
9245   match(Set dst (MinReductionV src1 src2));
9246   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
9247   format %{
9248        "vextractf64x4 $ymm_0, $src2, 0\n\t"
9249        "vextractf64x4 $ymm_1, $src2, 1\n\t"
9250        "vminpd_macro $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
9251        "vpermpd      $ymm_1,$ymm_0,14\n\t"
9252        "vminpd_macro $ymm_0,$ymm_1,$ymm_0\t! minpd\n\t"
9253        "vpermilpd    $ymm_1,$ymm_0,1\n\t"
       "vminpd_macro $dst,$ymm_1,$ymm_0\t! minpd" %}
9255   ins_encode %{
9256     int vector_len = 1;
9257     KRegister ktmp = k1;
9258     __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
9259     __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
9260     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9261                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9262     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
9263     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9264                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9265     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
9266     __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
9267                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, true, vector_len);
9268   %}
9269   ins_pipe( pipe_slow );
9270 %}
9271 
9272 // ------- Max Reduction ------------
9273 
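// The max reductions mirror the min patterns above, substituting
// pmaxs*/vpmaxs* for the packed step and flipping the scalar cmov condition
// to Assembler::greater; the byte flavors re-sign-extend the final value
// with movsbl because pextrb zero-extends.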
9274 instruct rsmax8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
9275   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9276   match(Set dst (MaxReductionV src1 src2));
9277   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9278   format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pmaxsb  $tmp,$src2\n\t"
9280             "pextrb  $tmp2,$tmp, 0x1\n\t"
9281             "movsbl  $tmp2,$tmp2\n\t"
9282             "pextrb  $tmp3,$tmp,0x0\n\t"
9283             "movsbl  $tmp3,$tmp3\n\t"
9284             "cmpl  $tmp2,$tmp3\n\t"
9285             "cmovl  $tmp3,$tmp2\n\t"
9286             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction8B" %}
9297   ins_encode %{
9298     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
9299     __ pmaxsb($tmp$$XMMRegister, $src2$$XMMRegister);
9300     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
9301     __ movsbl($tmp2$$Register, $tmp2$$Register);
9302     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
9303     __ movsbl($tmp3$$Register, $tmp3$$Register);
9304     __ cmpl($tmp2$$Register, $tmp3$$Register);
9305     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9306     __ cmpl($src1$$Register, $tmp3$$Register);
9307     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9308     __ movl($dst$$Register, $tmp3$$Register);
9309     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
9310     __ movsbl($tmp2$$Register, $tmp2$$Register);
9311     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
9312     __ movsbl($tmp3$$Register, $tmp3$$Register);
9313     __ cmpl($tmp2$$Register, $tmp3$$Register);
9314     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9315     __ cmpl($tmp3$$Register, $dst$$Register);
9316     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
9317     __ movsbl($dst$$Register, $dst$$Register);
9318   %}
9319   ins_pipe( pipe_slow );
9320 %}
9321 
9322 instruct rsmax16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
9323   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9324   match(Set dst (MaxReductionV src1 src2));
9325   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9326   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
9327             "pmaxsb  $tmp4,$src2\n\t"
9328             "pshufd  $tmp,$tmp4,0x1\n\t"
9329             "pmaxsb  $tmp,$tmp4\n\t"
9330             "pextrb  $tmp2,$tmp, 0x1\n\t"
9331             "movsbl  $tmp2,$tmp2\n\t"
9332             "pextrb  $tmp3,$tmp,0x0\n\t"
9333             "movsbl  $tmp3,$tmp3\n\t"
9334             "cmpl  $tmp2,$tmp3\n\t"
9335             "cmovl  $tmp3,$tmp2\n\t"
9336             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction16B" %}
9347   ins_encode %{
9348     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
9349     __ pmaxsb($tmp4$$XMMRegister, $src2$$XMMRegister);
9350     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
9351     __ pmaxsb($tmp$$XMMRegister, $tmp4$$XMMRegister);
9352     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
9353     __ movsbl($tmp2$$Register, $tmp2$$Register);
9354     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
9355     __ movsbl($tmp3$$Register, $tmp3$$Register);
9356     __ cmpl($tmp2$$Register, $tmp3$$Register);
9357     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9358     __ cmpl($src1$$Register, $tmp3$$Register);
9359     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9360     __ movl($dst$$Register, $tmp3$$Register);
9361     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
9362     __ movsbl($tmp2$$Register, $tmp2$$Register);
9363     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
9364     __ movsbl($tmp3$$Register, $tmp3$$Register);
9365     __ cmpl($tmp2$$Register, $tmp3$$Register);
9366     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9367     __ cmpl($tmp3$$Register, $dst$$Register);
9368     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
9369     __ movsbl($dst$$Register, $dst$$Register);
9370   %}
9371   ins_pipe( pipe_slow );
9372 %}
9373 
9374 instruct rvmax16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
9375   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9376   match(Set dst (MaxReductionV src1 src2));
9377   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9378   format %{ "pshufd  $tmp4,$src2,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$src2\n\t"
9380             "pshufd  $tmp,$tmp4,0x1\n\t"
9381             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
9382             "pextrb  $tmp2,$tmp, 0x1\n\t"
9383             "movsbl  $tmp2,$tmp2\n\t"
9384             "pextrb  $tmp3,$tmp,0x0\n\t"
9385             "movsbl  $tmp3,$tmp3\n\t"
9386             "cmpl  $tmp2,$tmp3\n\t"
9387             "cmovl  $tmp3,$tmp2\n\t"
9388             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction16B" %}
9399   ins_encode %{
9400     int vector_len = 0;
9401     __ pshufd($tmp4$$XMMRegister, $src2$$XMMRegister, 0xE);
9402     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 0);
9403     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
9404     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
9405     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
9406     __ movsbl($tmp2$$Register, $tmp2$$Register);
9407     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
9408     __ movsbl($tmp3$$Register, $tmp3$$Register);
9409     __ cmpl($tmp2$$Register, $tmp3$$Register);
9410     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9411     __ cmpl($src1$$Register, $tmp3$$Register);
9412     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9413     __ movl($dst$$Register, $tmp3$$Register);
9414     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
9415     __ movsbl($tmp2$$Register, $tmp2$$Register);
9416     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
9417     __ movsbl($tmp3$$Register, $tmp3$$Register);
9418     __ cmpl($tmp2$$Register, $tmp3$$Register);
9419     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9420     __ cmpl($tmp3$$Register, $dst$$Register);
9421     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
9422     __ movsbl($dst$$Register, $dst$$Register);
9423   %}
9424   ins_pipe( pipe_slow );
9425 %}
9426 
9427 instruct rvmax32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
9428   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9429   match(Set dst (MaxReductionV src1 src2));
9430   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9431   format %{ "vextracti128_high  $tmp,$src2\n\t"
9432             "vpmaxsb  $tmp,$tmp,$src2\n\t"
9433             "pshufd  $tmp4,$tmp,0xE\n\t"
9434             "vpmaxsb  $tmp4,$tmp4,$tmp\n\t"
9435             "pshufd  $tmp,$tmp4,0x1\n\t"
9436             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
9437             "pextrb  $tmp2,$tmp, 0x1\n\t"
9438             "movsbl  $tmp2,$tmp2\n\t"
9439             "pextrb  $tmp3,$tmp,0x0\n\t"
9440             "movsbl  $tmp3,$tmp3\n\t"
9441             "cmpl  $tmp2,$tmp3\n\t"
9442             "cmovl  $tmp3,$tmp2\n\t"
9443             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction32B" %}
9454   ins_encode %{
9455     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9456     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
9457     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
9458     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
9459     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
9460     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
9461     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
9462     __ movsbl($tmp2$$Register, $tmp2$$Register);
9463     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
9464     __ movsbl($tmp3$$Register, $tmp3$$Register);
9465     __ cmpl($tmp2$$Register, $tmp3$$Register);
9466     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9467     __ cmpl($src1$$Register, $tmp3$$Register);
9468     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9469     __ movl($dst$$Register, $tmp3$$Register);
9470     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
9471     __ movsbl($tmp2$$Register, $tmp2$$Register);
9472     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
9473     __ movsbl($tmp3$$Register, $tmp3$$Register);
9474     __ cmpl($tmp2$$Register, $tmp3$$Register);
9475     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9476     __ cmpl($tmp3$$Register, $dst$$Register);
9477     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
9478     __ movsbl($dst$$Register, $dst$$Register);
9479   %}
9480   ins_pipe( pipe_slow );
9481 %}
9482 
9483 instruct rvmax64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, rRegI tmp2, rRegI tmp3, regF tmp4) %{
9484   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
9485   match(Set dst (MaxReductionV src1 src2));
9486   effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9487   format %{ "vextracti64x4_high  $tmp4,$src2\n\t"
9488             "vpmaxsb  $tmp4,$tmp4,$src2\n\t"
9489             "vextracti128_high  $tmp,$tmp4\n\t"
9490             "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
            "pshufd  $tmp4,$tmp,0xE\n\t"
            "vpmaxsb  $tmp4,$tmp4,$tmp\n\t"
            "pshufd  $tmp,$tmp4,0x1\n\t"
            "vpmaxsb  $tmp,$tmp,$tmp4\n\t"
9497             "pextrb  $tmp2,$tmp, 0x1\n\t"
9498             "movsbl  $tmp2,$tmp2\n\t"
9499             "pextrb  $tmp3,$tmp,0x0\n\t"
9500             "movsbl  $tmp3,$tmp3\n\t"
9501             "cmpl  $tmp2,$tmp3\n\t"
9502             "cmovl  $tmp3,$tmp2\n\t"
9503             "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\n\t"
            "pextrb  $tmp2,$tmp, 0x3\n\t"
            "movsbl  $tmp2,$tmp2\n\t"
            "pextrb  $tmp3,$tmp, 0x2\n\t"
            "movsbl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $tmp3,$dst\n\t"
            "cmovl  $dst,$tmp3\n\t"
            "movsbl  $dst,$dst\t! max reduction64B" %}
9514   ins_encode %{
9515     __ vextracti64x4_high($tmp4$$XMMRegister, $src2$$XMMRegister);
9516     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $src2$$XMMRegister, 1);
9517     __ vextracti128_high($tmp$$XMMRegister, $tmp4$$XMMRegister);
9518     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
9519     __ pshufd($tmp4$$XMMRegister, $tmp$$XMMRegister, 0xE);
9520     __ vpmaxsb($tmp4$$XMMRegister, $tmp4$$XMMRegister, $tmp$$XMMRegister, 0);
9521     __ pshufd($tmp$$XMMRegister, $tmp4$$XMMRegister,0x1);
9522     __ vpmaxsb($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp4$$XMMRegister, 0);
9523     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x1);
9524     __ movsbl($tmp2$$Register, $tmp2$$Register);
9525     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x0);
9526     __ movsbl($tmp3$$Register, $tmp3$$Register);
9527     __ cmpl($tmp2$$Register, $tmp3$$Register);
9528     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9529     __ cmpl($src1$$Register, $tmp3$$Register);
9530     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9531     __ movl($dst$$Register, $tmp3$$Register);
9532     __ pextrb($tmp2$$Register, $tmp$$XMMRegister,0x3);
9533     __ movsbl($tmp2$$Register, $tmp2$$Register);
9534     __ pextrb($tmp3$$Register, $tmp$$XMMRegister,0x2);
9535     __ movsbl($tmp3$$Register, $tmp3$$Register);
9536     __ cmpl($tmp2$$Register, $tmp3$$Register);
9537     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9538     __ cmpl($tmp3$$Register, $dst$$Register);
9539     __ cmovl(Assembler::greater, $dst$$Register, $tmp3$$Register);
9540     __ movsbl($dst$$Register, $dst$$Register);
9541   %}
9542   ins_pipe( pipe_slow );
9543 %}
9544 
9545 instruct rsmax4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
9546   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9547   match(Set dst (MaxReductionV src1 src2));
9548   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd  $tmp,$src2,0x1\n\t"
            "pmaxsw  $tmp,$src2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "movswl  $tmp2,$tmp2\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction4S" %}
9556   ins_encode %{
9557     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
9558     __ pmaxsw($tmp$$XMMRegister, $src2$$XMMRegister);
9559     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
9560     __ movswl($tmp2$$Register, $tmp2$$Register);
9561     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9562     __ movswl($tmp3$$Register, $tmp3$$Register);
9563     __ cmpl($tmp2$$Register, $tmp3$$Register);
9564     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9565     __ cmpl($src1$$Register, $tmp3$$Register);
9566     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9567     __ movl($dst$$Register, $tmp3$$Register);
9568   %}
9569   ins_pipe( pipe_slow );
9570 %}
9571 
9572 instruct rvmax4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2, rRegI tmp3) %{
9573   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9574   match(Set dst (MaxReductionV src1 src2));
9575   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd   $tmp,$src2,0x1\n\t"
            "vpmaxsw  $tmp,$tmp,$src2\n\t"
            "pextrw  $tmp2,$tmp, 0x1\n\t"
            "movswl  $tmp2,$tmp2\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl  $tmp2,$tmp3\n\t"
            "cmovl  $tmp3,$tmp2\n\t"
            "cmpl  $src1,$tmp3\n\t"
            "cmovl  $tmp3,$src1\n\t"
            "movl  $dst,$tmp3\t! max reduction4S" %}
9583   ins_encode %{
9584     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister,0x1);
9585     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
9586     __ pextrw($tmp2$$Register, $tmp$$XMMRegister,0x1);
9587     __ movswl($tmp2$$Register, $tmp2$$Register);
9588     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9589     __ movswl($tmp3$$Register, $tmp3$$Register);
9590     __ cmpl($tmp2$$Register, $tmp3$$Register);
9591     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp2$$Register);
9592     __ cmpl($src1$$Register, $tmp3$$Register);
9593     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9594     __ movl($dst$$Register, $tmp3$$Register);
9595   %}
9596   ins_pipe( pipe_slow );
9597 %}
9598 
9599 instruct rsmax8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9600   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9601   match(Set dst (MaxReductionV src1 src2));
9602   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
            "pmaxsw  $tmp2,$src2\n\t"
            "pshufd  $tmp,$tmp2,0x1\n\t"
            "pmaxsw  $tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp, 0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl    $tmp4,$tmp3\n\t"
            "cmovl   $tmp3,$tmp4\n\t"
            "cmpl    $src1,$tmp3\n\t"
            "cmovl   $tmp3,$src1\n\t"
            "movl    $dst,$tmp3\t! max reduction8S" %}
9616   ins_encode %{
9617     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister,0xE);
9618     __ pmaxsw($tmp2$$XMMRegister, $src2$$XMMRegister);
9619     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9620     __ pmaxsw($tmp$$XMMRegister, $tmp2$$XMMRegister);
9621     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9622     __ movswl($tmp4$$Register, $tmp4$$Register);
9623     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9624     __ movswl($tmp3$$Register, $tmp3$$Register);
9625     __ cmpl($tmp4$$Register, $tmp3$$Register);
9626     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9627     __ cmpl($src1$$Register, $tmp3$$Register);
9628     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9629     __ movl($dst$$Register, $tmp3$$Register);
9630   %}
9631   ins_pipe( pipe_slow );
9632 %}
9633 
9634 instruct rvmax8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9635   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9636   match(Set dst (MaxReductionV src1 src2));
9637   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
  format %{ "pshufd   $tmp,$src2,0xE\n\t"
            "vpmaxsw  $tmp,$tmp,$src2\n\t"
            "pshufd   $tmp2,$tmp,0x1\n\t"
            "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw   $tmp4,$tmp, 0x1\n\t"
            "movswl   $tmp4,$tmp4\n\t"
            "pextrw   $tmp3,$tmp, 0x0\n\t"
            "movswl   $tmp3,$tmp3\n\t"
            "cmpl     $tmp4,$tmp3\n\t"
            "cmovl    $tmp3,$tmp4\n\t"
            "cmpl     $src1,$tmp3\n\t"
            "cmovl    $tmp3,$src1\n\t"
            "movl     $dst,$tmp3\t! max reduction8S" %}
9648   ins_encode %{
9649     int vector_len = 0;
9650     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9651     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9652     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9653     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9654     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9655     __ movswl($tmp4$$Register, $tmp4$$Register);
9656     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9657     __ movswl($tmp3$$Register, $tmp3$$Register);
9658     __ cmpl($tmp4$$Register, $tmp3$$Register);
9659     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9660     __ cmpl($src1$$Register, $tmp3$$Register);
9661     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9662     __ movl($dst$$Register, $tmp3$$Register);
9663   %}
9664   ins_pipe( pipe_slow );
9665 %}
9666 
9667 instruct rvmax16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9668   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9669   match(Set dst (MaxReductionV src1 src2));
9670   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9671   format %{ "vextracti128_high  $tmp,$src2\n\t"
9672             "vpmaxsw  $tmp,$tmp,$src2\n\t"
9673             "pshufd  $tmp2,$tmp,0xE\n\t"
9674             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9675             "pshufd  $tmp2,$tmp,0x1\n\t"
9676             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp, 0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl    $tmp4,$tmp3\n\t"
            "cmovl   $tmp3,$tmp4\n\t"
            "cmpl    $src1,$tmp3\n\t"
            "cmovl   $tmp3,$src1\n\t"
            "movl    $dst,$tmp3\t! max reduction16S" %}
9686   ins_encode %{
9687     int vector_len = 1;
9688     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9689     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9690     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9691     __ vpmaxsw($tmp$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9692     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister,0x1);
9693     __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9694     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9695     __ movswl($tmp4$$Register, $tmp4$$Register);
9696     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9697     __ movswl($tmp3$$Register, $tmp3$$Register);
9698     __ cmpl($tmp4$$Register, $tmp3$$Register);
9699     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9700     __ cmpl($src1$$Register, $tmp3$$Register);
9701     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9702     __ movl($dst$$Register, $tmp3$$Register);
9703   %}
9704   ins_pipe( pipe_slow );
9705 %}
9706 
9707 instruct rvmax32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3, rRegI tmp4) %{
9708   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
9709   match(Set dst (MaxReductionV src1 src2));
9710   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP tmp4);
9711   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
9712             "vpmaxsw  $tmp2,$tmp2,$src2\n\t"
9713             "vextracti128_high  $tmp,$tmp2\n\t"
9714             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9715             "pshufd  $tmp2,$tmp,0xE\n\t"
9716             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
9717             "pshufd  $tmp2,$tmp,0x1\n\t"
9718             "vpmaxsw  $tmp,$tmp,$tmp2\n\t"
            "pextrw  $tmp4,$tmp, 0x1\n\t"
            "movswl  $tmp4,$tmp4\n\t"
            "pextrw  $tmp3,$tmp, 0x0\n\t"
            "movswl  $tmp3,$tmp3\n\t"
            "cmpl    $tmp4,$tmp3\n\t"
            "cmovl   $tmp3,$tmp4\n\t"
            "cmpl    $src1,$tmp3\n\t"
            "cmovl   $tmp3,$src1\n\t"
            "movl    $dst,$tmp3\t! max reduction32S" %}
9728   ins_encode %{
    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ vpmaxsw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmaxsw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
9738     __ pextrw($tmp4$$Register, $tmp$$XMMRegister,0x1);
9739     __ movswl($tmp4$$Register, $tmp4$$Register);
9740     __ pextrw($tmp3$$Register, $tmp$$XMMRegister,0x0);
9741     __ movswl($tmp3$$Register, $tmp3$$Register);
9742     __ cmpl($tmp4$$Register, $tmp3$$Register);
9743     __ cmovl(Assembler::greater, $tmp3$$Register, $tmp4$$Register);
9744     __ cmpl($src1$$Register, $tmp3$$Register);
9745     __ cmovl(Assembler::greater, $tmp3$$Register, $src1$$Register);
9746     __ movl($dst$$Register, $tmp3$$Register);
9747   %}
9748   ins_pipe( pipe_slow );
9749 %}
9750 
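// Integer Max Reduction
// The int variants stay in the XMM domain throughout: pmaxsd/vpmaxsd folds
// the lanes, movd inserts the scalar input, and a final packed max plus movd
// produces the GPR result.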
9751 instruct rsmax2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
9752   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9753   match(Set dst (MaxReductionV src1 src2));
9754   effect(TEMP tmp, TEMP tmp2);
9755   format %{ "pshufd  $tmp,$src2,0x1\n\t"
9756             "pmaxsd  $tmp,$src2\n\t"
9757             "movd    $tmp2,$src1\n\t"
9758             "pmaxsd  $tmp2,$tmp\n\t"
9759             "movd    $dst,$tmp2\t! max reduction2I" %}
9760   ins_encode %{
9761     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9762     __ pmaxsd($tmp$$XMMRegister, $src2$$XMMRegister);
9763     __ movdl($tmp2$$XMMRegister, $src1$$Register);
9764     __ pmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister);
9765     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9766   %}
9767   ins_pipe( pipe_slow );
9768 %}
9769 
9770 instruct rvmax2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
9771   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9772   match(Set dst (MaxReductionV src1 src2));
9773   effect(TEMP tmp, TEMP tmp2);
9774   format %{ "pshufd   $tmp,$src2,0x1\n\t"
9775             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9776             "movd     $tmp,$src1\n\t"
9777             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9778             "movd     $dst,$tmp2\t! max reduction2I" %}
9779   ins_encode %{
9780     int vector_len = 0;
9781     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
9782     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9783     __ movdl($tmp$$XMMRegister, $src1$$Register);
9784     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9785     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9786   %}
9787   ins_pipe( pipe_slow );
9788 %}
9789 
9790 instruct rsmax4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9791   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9792   match(Set dst (MaxReductionV src1 src2));
9793   effect(TEMP tmp, TEMP tmp2);
9794   format %{ "pshufd  $tmp,$src2,0xE\n\t"
9795             "pmaxsd  $tmp,$src2\n\t"
9796             "pshufd  $tmp2,$tmp,0x1\n\t"
9797             "pmaxsd  $tmp2,$tmp\n\t"
9798             "movd    $tmp,$src1\n\t"
9799             "pmaxsd  $tmp2,$tmp\n\t"
9800             "movd    $dst,$tmp2\t! max reduction4I" %}
9801   ins_encode %{
9802     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9803     __ pmaxsd($tmp$$XMMRegister, $src2$$XMMRegister);
9804     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
9805     __ pmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
9806     __ movdl($tmp$$XMMRegister, $src1$$Register);
9807     __ pmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister);
9808     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9809   %}
9810   ins_pipe( pipe_slow );
9811 %}
9812 
9813 instruct rvmax4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9814   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9815   match(Set dst (MaxReductionV src1 src2));
9816   effect(TEMP tmp, TEMP tmp2);
9817   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9818             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9819             "pshufd   $tmp,$tmp2,0x1\n\t"
9820             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9821             "movd     $tmp,$src1\n\t"
9822             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9823             "movd     $dst,$tmp2\t! max reduction4I" %}
9824   ins_encode %{
9825     int vector_len = 0;
9826     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9827     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9828     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9829     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9830     __ movdl($tmp$$XMMRegister, $src1$$Register);
9831     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9832     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9833   %}
9834   ins_pipe( pipe_slow );
9835 %}
9836 
9837 instruct rvmax4I_reduction_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
9838   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9839   match(Set dst (MaxReductionV src1 src2));
9840   effect(TEMP tmp, TEMP tmp2);
9841   format %{ "pshufd   $tmp,$src2,0xE\n\t"
9842             "vpmaxsd  $tmp2,$tmp,$src2\n\t"
9843             "pshufd   $tmp,$tmp2,0x1\n\t"
9844             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9845             "movd     $tmp,$src1\n\t"
9846             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9847             "movd     $dst,$tmp2\t! max reduction4I" %}
9848   ins_encode %{
9849     int vector_len = 0;
9850     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
9851     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9852     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9853     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9854     __ movdl($tmp$$XMMRegister, $src1$$Register);
9855     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9856     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9857   %}
9858   ins_pipe( pipe_slow );
9859 %}
9860 
9861 instruct rvmax8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
9862   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9863   match(Set dst (MaxReductionV src1 src2));
9864   effect(TEMP tmp, TEMP tmp2);
9865   format %{ "vextracti128_high   $tmp,$src2\n\t"
9866             "vpmaxsd  $tmp,$tmp,$src2\n\t"
9867             "pshufd   $tmp2,$tmp,0xE\n\t"
9868             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9869             "pshufd   $tmp,$tmp2,0x1\n\t"
9870             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9871             "movd     $tmp,$src1\n\t"
9872             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9873             "movd     $dst,$tmp2\t! max reduction8I" %}
9874   ins_encode %{
9875     int vector_len = 1;
9876     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9877     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9878     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9879     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9880     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9881     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9882     __ movdl($tmp$$XMMRegister, $src1$$Register);
9883     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9884     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9885   %}
9886   ins_pipe( pipe_slow );
9887 %}
9888 
9889 instruct rvmax8I_reduction_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
9890   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9891   match(Set dst (MaxReductionV src1 src2));
9892   effect(TEMP tmp, TEMP tmp2);
9893   format %{ "vextracti128_high   $tmp,$src2\n\t"
9894             "vpmaxsd  $tmp,$tmp,$src2\n\t"
9895             "pshufd   $tmp2,$tmp,0xE\n\t"
9896             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9897             "pshufd   $tmp,$tmp2,0x1\n\t"
9898             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9899             "movd     $tmp,$src1\n\t"
9900             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9901             "movd     $dst,$tmp2\t! max reduction8I" %}
9902   ins_encode %{
9903     int vector_len = 1;
9904     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
9905     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
9906     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9907     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9908     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9909     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9910     __ movdl($tmp$$XMMRegister, $src1$$Register);
9911     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9912     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9913   %}
9914   ins_pipe( pipe_slow );
9915 %}
9916 
9917 instruct rvmax16I_reduction_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
9918   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
9919   match(Set dst (MaxReductionV src1 src2));
9920   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
9921   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
9922             "vpmaxsd  $tmp3,$tmp3,$src2\n\t"
9923             "vextracti128_high   $tmp,$tmp3\n\t"
9924             "vpmaxsd  $tmp,$tmp,$tmp3\n\t"
9925             "pshufd   $tmp2,$tmp,0xE\n\t"
9926             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9927             "pshufd   $tmp,$tmp2,0x1\n\t"
9928             "vpmaxsd  $tmp2,$tmp2,$tmp\n\t"
9929             "movd     $tmp,$src1\n\t"
9930             "vpmaxsd  $tmp2,$tmp,$tmp2\n\t"
9931             "movd     $dst,$tmp2\t! max reduction16I" %}
9932   ins_encode %{
9933     int vector_len = 2;
9934     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
9935     __ vpmaxsd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
9936     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
9937     __ vpmaxsd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
9938     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
9939     __ vpmaxsd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9940     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister,0x1);
9941     __ vpmaxsd($tmp2$$XMMRegister,$tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
9942     __ movdl($tmp$$XMMRegister, $src1$$Register);
9943     __ vpmaxsd($tmp2$$XMMRegister,$tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
9944     __ movdl($dst$$Register, $tmp2$$XMMRegister);
9945   %}
9946   ins_pipe( pipe_slow );
9947 %}
9948 
9949 // Long Max Reduction
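// Note: pre-AVX512 x86 has no packed max for 64-bit lanes, so these patterns
// synthesize it: pcmpgtq builds an all-ones mask per lane and
// blendvpd/vblendvpd selects the greater element. The non-VEX blendvpd reads
// its mask implicitly from xmm0, which is why the SSE patterns pin the
// rxmm0 operand.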
9950 instruct rsmax1L_reduction_reg(rRegL dst, rRegL src1, legVecD src2, rxmm0 xmm_0, legVecD tmp2, legVecD tmp3) %{
9951   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9952   match(Set dst (MaxReductionV src1 src2));
9953   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
  format %{ "movdq     $xmm_0,$src1\n\t"
            "movdq     $tmp2,$src1\n\t"
            "movdqu    $tmp3,$src2\n\t"
            "pcmpgtq   $xmm_0,$tmp3\n\t"
            "blendvpd  $tmp3,$tmp2\n\t"
            "movdq     $dst,$tmp3\t! max reduction1L" %}
9959   ins_encode %{
9960     __ movdq($xmm_0$$XMMRegister,$src1$$Register);
9961     __ movdq($tmp2$$XMMRegister,$src1$$Register);
    __ movdqu($tmp3$$XMMRegister, $src2$$XMMRegister);
9963     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9964     __ blendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister);
9965     __ movdq($dst$$Register, $tmp3$$XMMRegister);
9966   %}
9967   ins_pipe( pipe_slow );
9968 %}
9969 
9970 instruct rsmax2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, rxmm0 xmm_0, vecX tmp2, vecX tmp3) %{
9971   predicate(UseSSE > 3 && UseAVX == 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9972   match(Set dst (MaxReductionV src1 src2));
9973   effect(TEMP xmm_0, TEMP tmp2, TEMP tmp3);
9974   format %{ "pshufd   $tmp3,$src2,0xE\n\t"
9975             "movdqu  $xmm_0,$src2\n\t"
9976             "pcmpgtq  $xmm_0,$tmp3\n\t"
9977             "blendvpd  $tmp3,$src2\n\t"
9978             "movdqu  $xmm_0,$tmp3\n\t"
9979             "movdq  $tmp2,$src1\n\t"
9980             "pcmpgtq  $xmm_0,$tmp2\n\t"
9981             "blendvpd  $tmp2,$tmp3\n\t"
9982             "movq     $dst,$tmp2\t! max reduction2L" %}
9983   ins_encode %{
9984     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 0xE);
9985     __ movdqu($xmm_0$$XMMRegister, $src2$$XMMRegister);
9986     __ pcmpgtq($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9987     __ blendvpd($tmp3$$XMMRegister, $src2$$XMMRegister);
9988     __ movdqu($xmm_0$$XMMRegister, $tmp3$$XMMRegister);
9989     __ movdq($tmp2$$XMMRegister, $src1$$Register);
9990     __ pcmpgtq($xmm_0$$XMMRegister, $tmp2$$XMMRegister);
9991     __ blendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister);
9992     __ movdq($dst$$Register, $tmp2$$XMMRegister);
9993   %}
9994   ins_pipe( pipe_slow );
9995 %}
9996 
9997 instruct rvmax2L_reduction_reg(rRegL dst, rRegL src1, legVecX src2, legVecX tmp, legVecX tmp2, legVecX tmp3) %{
9998   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
9999   match(Set dst (MaxReductionV src1 src2));
10000   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "pshufd     $tmp2,$src2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$src2,$tmp2,$tmp\n\t"
            "movdq      $tmp,$src1\n\t"
            "vpcmpgtq   $tmp3,$tmp2,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp,$tmp2,$tmp3\n\t"
            "movdq      $dst,$tmp2\t! max reduction2L" %}
10008   ins_encode %{
10009     int vector_len = 0;
10010     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10011     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
10012     __ vblendvpd($tmp2$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10013     __ movdq($tmp$$XMMRegister,$src1$$Register);
10014     __ vpcmpgtq($tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10015     __ vblendvpd($tmp2$$XMMRegister, $tmp$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, vector_len);
10016     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10017   %}
10018   ins_pipe( pipe_slow );
10019 %}
10020 
10021 instruct rvmax4L_reduction_reg(rRegL dst, rRegL src1, legVecY src2, legVecY tmp, legVecY tmp2, legVecY tmp3) %{
10022   predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10023   match(Set dst (MaxReductionV src1 src2));
10024   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti128_high  $tmp2,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp2,$src2\n\t"
            "vblendvpd  $tmp2,$src2,$tmp2,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp2,$tmp3,$tmp\n\t"
            "movdq      $tmp,$src1\n\t"
            "vpcmpgtq   $tmp2,$tmp3,$tmp\n\t"
            "vblendvpd  $tmp2,$tmp,$tmp3,$tmp2\n\t"
            "movdq      $dst,$tmp2\t! max reduction4L" %}
10035   ins_encode %{
10036     int vector_len = 1;
10037     __ vextracti128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10038     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
10039     __ vblendvpd($tmp2$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10040     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
10041     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10042     __ vblendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10043     __ movdq($tmp$$XMMRegister,$src1$$Register);
10044     __ vpcmpgtq($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10045     __ vblendvpd($tmp2$$XMMRegister, $tmp$$XMMRegister,$tmp3$$XMMRegister,$tmp2$$XMMRegister, vector_len);
10046     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10047   %}
10048   ins_pipe( pipe_slow );
10049 %}
10050 
10051 instruct rvmax8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
10052   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10053   match(Set dst (MaxReductionV src1 src2));
10054   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
            "vpcmpgtq   $tmp,$tmp3,$src2\n\t"
            "vblendvpd  $tmp3,$src2,$tmp3,$tmp\n\t"
            "vextracti128_high  $tmp2,$tmp3\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp2,$tmp\n\t"
            "pshufd     $tmp3,$tmp2,0xE\n\t"
            "vpcmpgtq   $tmp,$tmp3,$tmp2\n\t"
            "vblendvpd  $tmp3,$tmp2,$tmp3,$tmp\n\t"
            "movdq      $tmp2,$src1\n\t"
            "vpcmpgtq   $tmp,$tmp2,$tmp3\n\t"
            "vblendvpd  $tmp2,$tmp3,$tmp2,$tmp\n\t"
            "movdq      $dst,$tmp2\t! max reduction8L" %}
10068   ins_encode %{
10069     int vector_len = 1;
10070     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
10071     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
10072     __ vblendvpd($tmp3$$XMMRegister, $src2$$XMMRegister, $tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10073     __ vextracti128_high($tmp2$$XMMRegister, $tmp3$$XMMRegister);
10074     __ vpcmpgtq($tmp$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
10075     __ vblendvpd($tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10076     __ pshufd($tmp3$$XMMRegister, $tmp2$$XMMRegister, 0xE);
10077     __ vpcmpgtq($tmp$$XMMRegister, $tmp3$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10078     __ vblendvpd($tmp3$$XMMRegister,$tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp$$XMMRegister, vector_len);
10079     __ movdq($tmp2$$XMMRegister, $src1$$Register);
10080     __ vpcmpgtq($tmp$$XMMRegister,$tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
10081     __ vblendvpd($tmp2$$XMMRegister,$tmp3$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
10082     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10083   %}
10084   ins_pipe( pipe_slow );
10085 %}
10086 
10087 // Float max Reduction
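// Note: a plain maxps/maxpd does not match Java's Math.max semantics for
// NaN and -0.0/+0.0, so these patterns go through vmin_max_macro /
// vmin_max_macro_evex, which are assumed to expand to a compare-and-blend
// sequence honoring those rules; the two boolean arguments apparently select
// single vs. double precision and min vs. max.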
10088 instruct rvmax2F_reduction_reg_av(legRegF dst, legVecD src, legVecD tmp,
10089                                   legVecD dtmp, legVecD atmp, legVecD btmp, legVecX xmm_1) %{
10090   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10091   match(Set dst (MaxReductionV dst src));
10092   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
  format %{ "vpermilps    $xmm_1,$src,1\n\t"
            "vmaxps_macro $dtmp,$xmm_1,$src\t! maxps\n\t"
            "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10096   ins_encode %{
10097     int vector_len = 0;
10098     __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
10099     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10100                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, false, vector_len);
10101     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10102                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, false, vector_len);
10103   %}
10104   ins_pipe( pipe_slow );
10105 %}
10106 
10107 instruct rvmax2F_reduction_reg(legRegF dst, immF src1, legVecD src2, legVecD tmp,
10108                                legVecD atmp, legVecD btmp, legVecX xmm_1) %{
10109   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF && 
10110             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10111   match(Set dst (MaxReductionV src1 src2));
10112   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
  format %{ "vpermilps    $xmm_1,$src2,1\n\t"
            "vmaxps_macro $dst,$xmm_1,$src2\t! maxps" %}
10115   ins_encode %{
10116     int vector_len = 0;
10117     __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
10118     __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10119                       $atmp$$XMMRegister, $btmp$$XMMRegister, true, false, vector_len);
10120   %}
10121   ins_pipe( pipe_slow );
10122 %}
10123 
10124 instruct rvmax4F_reduction_reg_av(legRegF dst, legVecX src, legVecX tmp, legVecX dtmp,
10125                                   legVecX atmp, legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
10126   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10127   match(Set dst (MaxReductionV dst src));
10128   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
10129   format %{ "vpermilps    $xmm_1,$src,14\n\t"
10130             "vmaxps_macro $xmm_0,$xmm_1,$src\t! maxps\n\t"
10131             "vpermilps    $xmm_1,$xmm_0,1\n\t"
10132             "vmaxps_macro $dtmp,$xmm_1,$xmm_0\t! maxps\n\t"
10133             "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10134   ins_encode %{
10135     int vector_len = 0;
10136     __ vpermilps($xmm_1$$XMMRegister, $src$$XMMRegister, 14, vector_len);
10137     __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10138                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10139     __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
10140     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
10141                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10142     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10143                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10144   %}
10145   ins_pipe( pipe_slow );
10146 %}
10147 
10148 instruct rvmax4F_reduction_reg(legRegF dst, immF src1, legVecX src2, legVecX tmp,
10149                                legVecX atmp, legVecX btmp, legVecX xmm_0, legVecX xmm_1) %{
10150   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF && 
10151             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10152   match(Set dst (MaxReductionV src1 src2));
10153   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1);
10154   format %{ "vpermilps    $xmm_1,$src2,14\n\t"
10155             "vmaxps_macro $xmm_0,$xmm_1,$src2\t! maxps\n\t"
10156             "vpermilps    $xmm_1,$xmm_0,1\n\t"
            "vmaxps_macro $dst,$xmm_1,$xmm_0\t! maxps" %}
10158   ins_encode %{
10159     int vector_len = 0;
10160     __ vpermilps($xmm_1$$XMMRegister, $src2$$XMMRegister, 14, vector_len);
10161     __ vmin_max_macro($xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10162                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10163     __ vpermilps($xmm_1$$XMMRegister, $xmm_0$$XMMRegister, 1, vector_len);
10164     __ vmin_max_macro($dst$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister, $tmp$$XMMRegister,
10165                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10166   %}
10167   ins_pipe( pipe_slow );
10168 %}
10169 
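// For 256-bit vectors the halves are swapped in with vperm2f128 (imm8 = 1)
// before folding, since vpermilps/vpermilpd only shuffle within 128-bit
// lanes.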
10170 instruct rvmax8F_reduction_reg_av(legRegF dst, legVecY src, legVecY tmp, legVecY dtmp,
10171                                   legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10172   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10173   match(Set dst (MaxReductionV dst src));
10174   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10175   format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
10176             "vmaxps_macro $ymm_0,$ymm_1,$src\t! maxps\n\t"
10177             "vpermilps    $ymm_1,$ymm_0,14\n\t"
            "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10179             "vpermilps    $ymm_1,$ymm_0,1\n\t"
10180             "vmaxps_macro $dtmp,$ymm_1,$ymm_0\t! maxps\n\t" 
10181             "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10182   ins_encode %{
10183     int vector_len = 1;
10184     __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
10185     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10186                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10187     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10188     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
10189                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10190     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10191     __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10192                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10193     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10194                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10195   %}
10196   ins_pipe( pipe_slow );
10197 %}
10198 
10199 instruct rvmax8F_reduction_reg(legRegF dst, immF src1, legVecY src2, legVecY tmp, 
10200                                legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10201   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF &&
10202             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10203   match(Set dst (MaxReductionV src1 src2));
10204   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10205   format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
10206             "vmaxps_macro $ymm_0,$ymm_1,$src2\t! maxps\n\t"
10207             "vpermilps    $ymm_1,$ymm_0,14\n\t"
            "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10209             "vpermilps    $ymm_1,$ymm_0,1\n\t"
10210             "vmaxps_macro $dst,$ymm_1,$ymm_0\t! maxps" %}
10211   ins_encode %{
10212     int vector_len = 1;
10213     __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
10214     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10215                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10216     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10217     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, $tmp$$XMMRegister,
10218                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10219     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10220     __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10221                       $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10222   %}
10223   ins_pipe( pipe_slow );
10224 %}
10225 
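// The 512-bit forms split the source into two 256-bit halves with
// vextractf64x4 and reduce as above; the EVEX macro variant takes an opmask
// register (k1 here) as scratch instead of an XMM mask temporary.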
10226 instruct rvmax16F_reduction_reg_av(regF dst, vecZ src, vecZ dtmp, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10227   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10228   match(Set dst (MaxReductionV dst src));
10229   effect(TEMP dst, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10230   format %{
10231        "vextractf64x4 $ymm_0, $src, 0\n\t"
10232        "vextractf64x4 $ymm_1, $src, 1\n\t"
10233        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10234        "vpermpd      $ymm_1,$ymm_0, 78\n\t"
       "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
       "vpermilps    $ymm_1,$ymm_0,14\n\t"
       "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10238        "vpermilps    $ymm_1,$ymm_0,1\n\t"
10239        "vmaxps_macro $dtmp,$ymm_1,$ymm_0\t! maxps\n\t" 
10240        "vmaxps_macro $dst,$dtmp,$dst\t! maxps" %}
10241   ins_encode %{
10242     int vector_len = 1;
10243     KRegister  ktmp = k1;
10244     __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
10245     __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
10246     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10247                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10248     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
10249     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10250                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10251     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10252     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10253                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10254     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10255     __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
10256                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10257     __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
10258                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10259   %}
10260   ins_pipe( pipe_slow );
10261 %}
10262 
10263 instruct rvmax16F_reduction_reg(regF dst, immF src1, vecZ src2, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10264   predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeF::NEG_INF &&
10265             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
10266   match(Set dst (MaxReductionV src1 src2));
10267   effect(TEMP dst, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10268   format %{
10269        "vextractf64x4 $ymm_0, $src2, 0\n\t"
10270        "vextractf64x4 $ymm_1, $src2, 1\n\t"
10271        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10272        "vpermpd      $ymm_1,$ymm_0, 78\n\t"
10273        "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10274        "vpermilps    $ymm_1,$ymm_0,14\n\t"
       "vmaxps_macro $ymm_0,$ymm_1,$ymm_0\t! maxps\n\t"
10276        "vpermilps    $ymm_1,$ymm_0,1\n\t"
10277        "vmaxps_macro $dst,$ymm_1,$ymm_0\t! maxps" %}
10278   ins_encode %{
10279     int vector_len = 1;
10280     KRegister  ktmp = k1;
10281     __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
10282     __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
10283     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10284                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10285     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 78, vector_len);
10286     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10287                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10288     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10289     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10290                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10291     __ vpermilps($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10292     __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, ktmp,
10293                     $atmp$$XMMRegister, $btmp$$XMMRegister , true, false, vector_len);
10294   %}
10295   ins_pipe( pipe_slow );
10296 %}
10297 
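// Double Max Reduction
// Same shape as the float patterns, with vpermilpd/vperm2f128 shuffles and
// the macro's single-precision flag cleared to select double lanes.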
10298 instruct rvmax2D_reduction_reg_av(legRegD dst, legVecX src, legVecX tmp, legVecX dtmp,
10299                                   legVecX atmp, legVecX btmp, legVecX xmm_1) %{
10300   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10301   match(Set dst (MaxReductionV dst src));
10302   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
10303   format %{ "vpermilpd    $xmm_1,$src,1\n\t"
            "vmaxpd_macro $dtmp,$xmm_1,$src\t! maxpd\n\t"
            "vmaxpd_macro $dst,$dtmp,$dst\t! maxpd" %}
10306   ins_encode %{
10307     int vector_len = 0;
10308     __ vpermilpd($xmm_1$$XMMRegister, $src$$XMMRegister, 1, vector_len);
10309     __ vmin_max_macro($dtmp$$XMMRegister, $xmm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10310                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, false, vector_len);
10311     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10312                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, false, vector_len);
10313   %}
10314   ins_pipe( pipe_slow );
10315 %}
10316 
10317 instruct rvmax2D_reduction_reg(legRegD dst, immD src1 , legVecX src2, legVecX tmp,
10318                                legVecX atmp, legVecX btmp, legVecX xmm_1) %{
10319   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::NEG_INF &&
10320             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10321   match(Set dst (MaxReductionV src1 src2));
10322   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1);
10323   format %{ "vpermilpd    $xmm_1,$src2,1\n\t"
            "vmaxpd_macro $dst,$xmm_1,$src2\t! maxpd" %}
10325   ins_encode %{
10326     int vector_len = 0;
10327     __ vpermilpd($xmm_1$$XMMRegister, $src2$$XMMRegister, 1, vector_len);
10328     __ vmin_max_macro($dst$$XMMRegister, $xmm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10329                       $atmp$$XMMRegister, $btmp$$XMMRegister, false, false, vector_len);
10330   %}
10331   ins_pipe( pipe_slow );
10332 %}

instruct rvmax4D_reduction_reg_av(legRegD dst, legVecY src, legVecY tmp, legVecY dtmp,
10334                                   legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10335   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10336   match(Set dst (MaxReductionV dst src));
10337   effect(TEMP dst, TEMP tmp, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10338   format %{ "vperm2f128   $ymm_1,$src,$src,1\n\t"
            "vmaxpd_macro $ymm_0,$ymm_1,$src\t! maxpd\n\t"
            "vpermilpd    $ymm_1,$ymm_0,1\n\t"
            "vmaxpd_macro $dtmp,$ymm_1,$ymm_0\t! maxpd\n\t"
            "vmaxpd_macro $dst,$dtmp,$dst\t! maxpd" %}
10343   ins_encode %{
10344     int vector_len = 1;
10345     __ vperm2f128($ymm_1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
10346     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
10347                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10348     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10349     __ vmin_max_macro($dtmp$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10350                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10351     __ vmin_max_macro($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister,
10352                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10353   %}
10354   ins_pipe( pipe_slow );
10355 %}
10356 
10357 instruct rvmax4D_reduction_reg(legRegD dst, immD src1, legVecY src2, legVecY tmp, 
10358                                legVecY atmp, legVecY btmp, legVecY ymm_0, legVecY ymm_1) %{
10359   predicate(UseAVX > 0 && n->in(1)->as_Type()->type() == (Type*)TypeD::NEG_INF &&
10360             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10361   match(Set dst (MaxReductionV src1 src2));
10362   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10363   format %{ "vperm2f128   $ymm_1,$src2,$src2,1\n\t"
            "vmaxpd_macro $ymm_0,$ymm_1,$src2\t! maxpd\n\t"
            "vpermilpd    $ymm_1,$ymm_0,1\n\t"
            "vmaxpd_macro $dst,$ymm_1,$ymm_0\t! maxpd" %}
10367   ins_encode %{
10368     int vector_len = 1;
10369     __ vperm2f128($ymm_1$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, 1);
10370     __ vmin_max_macro($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
10371                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10372     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10373     __ vmin_max_macro($dst$$XMMRegister, $ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $tmp$$XMMRegister,
10374                       $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10375   %}
10376   ins_pipe( pipe_slow );
10377 %}
10378 
10379 instruct rvmax8D_reduction_reg_av(regD dst, vecZ src, vecZ dtmp, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10380   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10381   match(Set dst (MaxReductionV dst src));
10382   effect(TEMP dst, TEMP dtmp, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10383   format %{
10384        "vextractf64x4 $ymm_0, $src, 0\n\t"
10385        "vextractf64x4 $ymm_1, $src, 1\n\t"
10386        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10387        "vpermpd      $ymm_1,$ymm_0, 14\n\t"
10388        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10389        "vpermilpd    $ymm_1,$ymm_0,1\n\t"
10390        "vmaxpd_macro $dtmp,$ymm_1,$ymm_0\t! maxpd\n\t" 
       "vmaxpd_macro $dst,$dtmp,$dst\t! maxpd" %}
10392   ins_encode %{
10393     int vector_len = 1;
10394     KRegister ktmp = k1;
10395     __ vextractf64x4($ymm_0$$XMMRegister, $src$$XMMRegister, 0);
10396     __ vextractf64x4($ymm_1$$XMMRegister, $src$$XMMRegister, 1);
10397     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10398                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10399     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10400     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10401                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10402     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10403     __ vmin_max_macro_evex($dtmp$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10404                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10405     __ vmin_max_macro_evex($dst$$XMMRegister, $dtmp$$XMMRegister, $dst$$XMMRegister, ktmp,
10406                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10407   %}
10408   ins_pipe( pipe_slow );
10409 %}
10410 
10411 
10412 instruct rvmax8D_reduction_reg(regD dst, immD src1, vecZ src2, vecZ atmp, vecZ btmp, vecY ymm_0, vecY ymm_1) %{
10413   predicate(UseAVX > 2 && n->in(1)->as_Type()->type() == (Type*)TypeD::NEG_INF &&
10414             n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
10415   match(Set dst (MaxReductionV src1 src2));
10416   effect(TEMP dst, TEMP atmp, TEMP btmp, TEMP ymm_0, TEMP ymm_1);
10417   format %{
10418        "vextractf64x4 $ymm_0, $src2, 0\n\t"
10419        "vextractf64x4 $ymm_1, $src2, 1\n\t"
10420        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10421        "vpermpd      $ymm_1,$ymm_0, 14\n\t"
10422        "vmaxpd_macro $ymm_0,$ymm_1,$ymm_0\t! maxpd\n\t"
10423        "vpermilpd    $ymm_1,$ymm_0,1\n\t"
       "vmaxpd_macro $dst,$ymm_1,$ymm_0\t! maxpd" %}
10425   ins_encode %{
10426     int vector_len = 1;
10427     KRegister ktmp = k1;
10428     __ vextractf64x4($ymm_0$$XMMRegister, $src2$$XMMRegister, 0);
10429     __ vextractf64x4($ymm_1$$XMMRegister, $src2$$XMMRegister, 1);
10430     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10431                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10432     __ vpermpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 14, vector_len);
10433     __ vmin_max_macro_evex($ymm_0$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10434                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10435     __ vpermilpd($ymm_1$$XMMRegister, $ymm_0$$XMMRegister, 1, vector_len);
10436     __ vmin_max_macro_evex($dst$$XMMRegister, $ymm_1$$XMMRegister, $ymm_0$$XMMRegister, ktmp,
10437                     $atmp$$XMMRegister, $btmp$$XMMRegister , false, false, vector_len);
10438   %}
10439   ins_pipe( pipe_slow );
10440 %}
10441 
10442 
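// And Reduction
// AND is associative and lane-order insensitive, so the reduction simply
// folds vector halves with pand/vpand, ANDs the low element(s) into the
// scalar accumulator, and sign-extends the final byte/short result.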
10443 instruct rsand8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10444   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10445   match(Set dst (AndReductionV src1 src2));
10446   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10447   format %{
10448             "pshufd  $tmp,$src2,0x1\n\t"
10449             "pand    $tmp,$src2\n\t"
10450             "movzbl  $dst,$src1\n\t"
10451             "pextrb  $tmp2,$tmp, 0x0\n\t"
10452             "andl    $dst,$tmp2\n\t"
10453             "pextrb  $tmp2,$tmp, 0x1\n\t"
10454             "andl    $dst,$tmp2\n\t"
10455             "pextrb  $tmp2,$tmp, 0x2\n\t"
10456             "andl    $dst,$tmp2\n\t"
10457             "pextrb  $tmp2,$tmp, 0x3\n\t"
10458             "andl    $dst,$tmp2\n\t"
10459             "movsbl  $dst,$dst\t! and reduction8B" %}
10460   ins_encode %{
10461     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10462     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10463     __ movzbl($dst$$Register, $src1$$Register);
10464     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10465     __ andl($dst$$Register, $tmp2$$Register);
10466     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10467     __ andl($dst$$Register, $tmp2$$Register);
10468     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
10469     __ andl($dst$$Register, $tmp2$$Register);
10470     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
10471     __ andl($dst$$Register, $tmp2$$Register);
10472     __ movsbl($dst$$Register, $dst$$Register);
10473   %}
10474   ins_pipe( pipe_slow );
10475 %}
10476 
10477 instruct rsand16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10478   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10479   match(Set dst (AndReductionV src1 src2));
10480   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10481   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10482             "pand    $tmp,$src2\n\t"
10483             "pshufd  $tmp2,$tmp,0x1\n\t"
            "pand    $tmp,$tmp2\n\t"
10485             "movzbl  $dst,$src1\n\t"
10486             "pextrb  $tmp3,$tmp, 0x0\n\t"
10487             "andl    $dst,$tmp3\n\t"
10488             "pextrb  $tmp3,$tmp, 0x1\n\t"
10489             "andl    $dst,$tmp3\n\t"
10490             "pextrb  $tmp3,$tmp, 0x2\n\t"
10491             "andl    $dst,$tmp3\n\t"
10492             "pextrb  $tmp3,$tmp, 0x3\n\t"
10493             "andl    $dst,$tmp3\n\t"
10494             "movsbl  $dst,$dst\t! and reduction16B" %}
10495   ins_encode %{
10496     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10497     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10498     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10499     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
10500     __ movzbl($dst$$Register, $src1$$Register);
10501     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10502     __ andl($dst$$Register, $tmp3$$Register);
10503     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10504     __ andl($dst$$Register, $tmp3$$Register);
10505     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10506     __ andl($dst$$Register, $tmp3$$Register);
10507     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10508     __ andl($dst$$Register, $tmp3$$Register);
10509     __ movsbl($dst$$Register, $dst$$Register);
10510   %}
10511   ins_pipe( pipe_slow );
10512 %}
10513 
10514 instruct rvand32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
  predicate(UseAVX > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10516   match(Set dst (AndReductionV src1 src2));
10517   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
10519             "vpand   $tmp,$tmp,$src2\n\t"
10520             "pshufd  $tmp2,$tmp,0xE\n\t"
10521             "vpand   $tmp,$tmp,$tmp2\n\t"
10522             "pshufd  $tmp2,$tmp,0x1\n\t"
10523             "vpand   $tmp,$tmp,$tmp2\n\t"
10524             "movzbl  $dst,$src1\n\t"
10525             "pextrb  $tmp3,$tmp, 0x0\n\t"
10526             "andl    $dst,$tmp3\n\t"
10527             "pextrb  $tmp3,$tmp, 0x1\n\t"
10528             "andl    $dst,$tmp3\n\t"
10529             "pextrb  $tmp3,$tmp, 0x2\n\t"
10530             "andl    $dst,$tmp3\n\t"
10531             "pextrb  $tmp3,$tmp, 0x3\n\t"
10532             "andl    $dst,$tmp3\n\t"
10533             "movsbl  $dst,$dst\t! and reduction32B" %}
10534   ins_encode %{
10535     int vector_len = 0;
10536     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10537     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10538     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10539     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10540     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10541     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10542     __ movzbl($dst$$Register, $src1$$Register);
10543     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10544     __ andl($dst$$Register, $tmp3$$Register);
10545     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10546     __ andl($dst$$Register, $tmp3$$Register);
10547     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10548     __ andl($dst$$Register, $tmp3$$Register);
10549     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10550     __ andl($dst$$Register, $tmp3$$Register);
10551     __ movsbl($dst$$Register, $dst$$Register);
10552   %}
10553   ins_pipe( pipe_slow );
10554 %}
10555 
10556 instruct rvand64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10557   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10558   match(Set dst (AndReductionV src1 src2));
10559   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10560   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10561             "vpand   $tmp2,$tmp2,$src2\n\t"
10562             "vextracti128_high  $tmp,$tmp2\n\t"
10563             "vpand   $tmp,$tmp,$tmp2\n\t"
10564             "pshufd  $tmp2,$tmp,0xE\n\t"
10565             "vpand   $tmp,$tmp,$tmp2\n\t"
10566             "pshufd  $tmp2,$tmp,0x1\n\t"
10567             "vpand   $tmp,$tmp,$tmp2\n\t"
10568             "movzbl  $dst,$src1\n\t"
10569             "movdl   $tmp3,$tmp\n\t"
10570             "andl    $dst,$tmp3\n\t"
10571             "shrl    $tmp3,0x8\n\t"
10572             "andl    $dst,$tmp3\n\t"
10573             "shrl    $tmp3,0x8\n\t"
10574             "andl    $dst,$tmp3\n\t"
10575             "shrl    $tmp3,0x8\n\t"
10576             "andl    $dst,$tmp3\n\t"
10577             "movsbl  $dst,$dst\t! and reduction64B" %}
10578   ins_encode %{
10579     int vector_len = 0;
10580     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10581     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10582     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10583     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10584     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10585     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10586     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10587     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10588     __ movzbl($dst$$Register, $src1$$Register);
10589     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10590     __ andl($dst$$Register, $tmp3$$Register);
10591     __ shrl($tmp3$$Register, 8);
10592     __ andl($dst$$Register, $tmp3$$Register);
10593     __ shrl($tmp3$$Register, 8);
10594     __ andl($dst$$Register, $tmp3$$Register);
10595     __ shrl($tmp3$$Register, 8);
10596     __ andl($dst$$Register, $tmp3$$Register);
10597     __ movsbl($dst$$Register, $dst$$Register);
10598   %}
10599   ins_pipe( pipe_slow );
10600 %}
10601 
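// Short (16-bit) AND reductions use the same halving scheme but finish with
// pextrw, so only the last two 16-bit lanes are ANDed into the scalar before
// movswl sign-extends the result.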
10602 instruct rsand4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10603   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10604   match(Set dst (AndReductionV src1 src2));
10605   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10606   format %{
10607             "pshufd  $tmp,$src2,0x1\n\t"
10608             "pand    $tmp,$src2\n\t"
10609             "movzwl  $dst,$src1\n\t"
10610             "pextrw  $tmp2,$tmp, 0x0\n\t"
10611             "andw    $dst,$tmp2\n\t"
10612             "pextrw  $tmp2,$tmp, 0x1\n\t"
10613             "andw    $dst,$tmp2\n\t"
10614             "movswl  $dst,$dst\t! and reduction4S" %}
10615   ins_encode %{
10616     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10617     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10618     __ movzwl($dst$$Register, $src1$$Register);
10619     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10620     __ andw($dst$$Register, $tmp2$$Register);
10621     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10622     __ andw($dst$$Register, $tmp2$$Register);
10623     __ movswl($dst$$Register, $dst$$Register);
10624   %}
10625   ins_pipe( pipe_slow );
10626 %}
10627 
10628 instruct rsand8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10629   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10630   match(Set dst (AndReductionV src1 src2));
10631   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10632   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10633             "pand    $tmp,$src2\n\t"
10634             "pshufd  $tmp2,$tmp,0x1\n\t"
            "pand    $tmp,$tmp2\n\t"
10636             "movzwl  $dst,$src1\n\t"
10637             "pextrw  $tmp3,$tmp, 0x0\n\t"
10638             "andw    $dst,$tmp3\n\t"
10639             "pextrw  $tmp3,$tmp, 0x1\n\t"
10640             "andw    $dst,$tmp3\n\t"
10641             "movswl  $dst,$dst\t! and reduction8S" %}
10642   ins_encode %{
10643     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10644     __ pand($tmp$$XMMRegister, $src2$$XMMRegister);
10645     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10646     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
10647     __ movzwl($dst$$Register, $src1$$Register);
10648     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10649     __ andw($dst$$Register, $tmp3$$Register);
10650     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10651     __ andw($dst$$Register, $tmp3$$Register);
10652     __ movswl($dst$$Register, $dst$$Register);
10653   %}
10654   ins_pipe( pipe_slow );
10655 %}
10656 
10657 instruct rvand16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10658   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10659   match(Set dst (AndReductionV src1 src2));
10660   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
10662             "vpand   $tmp,$tmp,$src2\n\t"
10663             "pshufd  $tmp2,$tmp,0xE\n\t"
10664             "vpand   $tmp,$tmp,$tmp2\n\t"
10665             "pshufd  $tmp2,$tmp,0x1\n\t"
10666             "vpand   $tmp,$tmp,$tmp2\n\t"
10667             "movzwl  $dst,$src1\n\t"
10668             "pextrw  $tmp3,$tmp, 0x0\n\t"
10669             "andw    $dst,$tmp3\n\t"
10670             "pextrw  $tmp3,$tmp, 0x1\n\t"
10671             "andw    $dst,$tmp3\n\t"
10672             "movswl  $dst,$dst\t! and reduction16S" %}
10673   ins_encode %{
10674     int vector_len = 0;
10675     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10676     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10677     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10678     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10679     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10680     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10681     __ movzwl($dst$$Register, $src1$$Register);
10682     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10683     __ andw($dst$$Register, $tmp3$$Register);
10684     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10685     __ andw($dst$$Register, $tmp3$$Register);
10686     __ movswl($dst$$Register, $dst$$Register);
10687   %}
10688   ins_pipe( pipe_slow );
10689 %}
10690 
10691 instruct rvand32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
10692   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
10693   match(Set dst (AndReductionV src1 src2));
10694   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10695   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10696             "vpand   $tmp2,$tmp2,$src2\n\t"
10697             "vextracti128_high  $tmp,$tmp2\n\t"
10698             "vpand   $tmp,$tmp,$tmp2\n\t"
10699             "pshufd  $tmp2,$tmp,0xE\n\t"
10700             "vpand   $tmp,$tmp,$tmp2\n\t"
10701             "pshufd  $tmp2,$tmp,0x1\n\t"
10702             "vpand   $tmp,$tmp,$tmp2\n\t"
10703             "movzwl  $dst,$src1\n\t"
10704             "movdl   $tmp3,$tmp\n\t"
10705             "andw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
10707             "andw    $dst,$tmp3\n\t"
10708             "movswl  $dst,$dst\t! and reduction32S" %}
10709   ins_encode %{
10710     int vector_len = 0;
10711     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10712     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10713     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10714     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10715     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
10716     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10717     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10718     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10719     __ movzwl($dst$$Register, $src1$$Register);
10720     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
10721     __ andw($dst$$Register, $tmp3$$Register);
10722     __ shrl($tmp3$$Register, 16);
10723     __ andw($dst$$Register, $tmp3$$Register);
10724     __ movswl($dst$$Register, $dst$$Register);
10725   %}
10726   ins_pipe( pipe_slow );
10727 %}
10728 
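// Int AND reductions fold entirely in the XMM domain: the scalar input is
// moved in with movdl, combined with pand/vpand, and the result is moved
// back out, so no per-lane extraction is needed.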
10729 instruct rsand2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
10730   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10731   match(Set dst (AndReductionV src1 src2));
10732   effect(TEMP tmp, TEMP tmp2);
10733   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
10734             "pand    $tmp2,$src2\n\t"
10735             "movd    $tmp,$src1\n\t"
10736             "pand    $tmp2,$tmp\n\t"
10737             "movd    $dst,$tmp2\t! and reduction2I" %}
10738   ins_encode %{
10739     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
10740     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
10741     __ movdl($tmp$$XMMRegister, $src1$$Register);
10742     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10743     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10744   %}
10745   ins_pipe( pipe_slow );
10746 %}
10747 
10748 instruct rsand4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
10749   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10750   match(Set dst (AndReductionV src1 src2));
10751   effect(TEMP tmp, TEMP tmp2);
10752   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
10753             "pand    $tmp2,$src2\n\t"
10754             "pshufd  $tmp,$tmp2,0x1\n\t"
10755             "pand    $tmp2,$tmp\n\t"
10756             "movd    $tmp,$src1\n\t"
10757             "pand    $tmp2,$tmp\n\t"
10758             "movd    $dst,$tmp2\t! and reduction4I" %}
10759   ins_encode %{
10760     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10761     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
10762     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
10763     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10764     __ movdl($tmp$$XMMRegister, $src1$$Register);
10765     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10766     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10767   %}
10768   ins_pipe( pipe_slow );
10769 %}
10770 
10771 instruct rvand8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
10772   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10773   match(Set dst (AndReductionV src1 src2));
10774   effect(TEMP tmp, TEMP tmp2);
10775   format %{ "vextracti128_high  $tmp,$src2\n\t"
10776             "vpand    $tmp,$tmp,$src2\n\t"
10777             "vpshufd   $tmp2,$tmp,0xE\n\t"
10778             "vpand    $tmp,$tmp,$tmp2\n\t"
10779             "vpshufd   $tmp2,$tmp,0x1\n\t"
10780             "vpand    $tmp,$tmp,$tmp2\n\t"
10781             "movd     $tmp2,$src1\n\t"
10782             "vpand    $tmp2,$tmp,$tmp2\n\t"
10783             "movd     $dst,$tmp2\t! and reduction8I" %}
10784   ins_encode %{
10785     int vector_len = 0;
10786     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10787     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10788     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10789     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10790     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10791     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10792     __ movdl($tmp2$$XMMRegister, $src1$$Register);
10793     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10794     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10795   %}
10796   ins_pipe( pipe_slow );
10797 %}
10798 
10799 instruct rvand16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
10800   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
10801   match(Set dst (AndReductionV src1 src2));
10802   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
10803   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
10804             "vpand  $tmp3,$tmp3,$src2\n\t"
10805             "vextracti128_high  $tmp,$tmp3\n\t"
            "vpand    $tmp,$tmp,$tmp3\n\t"
10807             "vpshufd   $tmp2,$tmp,0xE\n\t"
10808             "vpand    $tmp,$tmp,$tmp2\n\t"
10809             "vpshufd   $tmp2,$tmp,0x1\n\t"
10810             "vpand    $tmp,$tmp,$tmp2\n\t"
10811             "movd     $tmp2,$src1\n\t"
10812             "vpand    $tmp2,$tmp,$tmp2\n\t"
10813             "movd     $dst,$tmp2\t! and reduction16I" %}
10814   ins_encode %{
10815     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
10816     __ vpand($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
10817     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
10818     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
10819     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, 0);
10820     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
10821     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, 0);
10822     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
10823     __ movdl($tmp2$$XMMRegister, $src1$$Register);
10824     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
10825     __ movdl($dst$$Register, $tmp2$$XMMRegister);
10826   %}
10827   ins_pipe( pipe_slow );
10828 %}
10829 
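// Long AND reductions produce a 64-bit scalar and need 64-bit general
// registers (rRegL), so they are restricted to LP64 builds.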
10830 #ifdef _LP64
10831 instruct rsand2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
10832   predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10833   match(Set dst (AndReductionV src1 src2));
10834   effect(TEMP tmp, TEMP tmp2);
10835   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
10836             "pand    $tmp2,$src2\n\t"
10837             "movdq   $tmp,$src1\n\t"
10838             "pand    $tmp2,$tmp\n\t"
10839             "movq   $dst,$tmp2\t! and reduction2L" %}
10840   ins_encode %{
10841     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
10842     __ pand($tmp2$$XMMRegister, $src2$$XMMRegister);
10843     __ movdq($tmp$$XMMRegister, $src1$$Register);
10844     __ pand($tmp2$$XMMRegister, $tmp$$XMMRegister);
10845     __ movq($dst$$Register, $tmp2$$XMMRegister);
10846   %}
10847   ins_pipe( pipe_slow );
10848 %}
10849 
10850 instruct rvand4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
10851   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10852   match(Set dst (AndReductionV src1 src2));
10853   effect(TEMP tmp, TEMP tmp2);
10854   format %{ "vextracti128_high  $tmp,$src2\n\t"
10855             "vpand  $tmp2,$tmp,$src2\n\t"
10856             "vpshufd  $tmp,$tmp2,0xE\n\t"
10857             "vpand  $tmp2,$tmp2,$tmp\n\t"
10858             "movq   $tmp,$src1\n\t"
10859             "vpand  $tmp2,$tmp2,$tmp\n\t"
10860             "movq   $dst,$tmp2\t! and reduction4L" %}
10861   ins_encode %{
10862     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10863     __ vpand($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
10864     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
10865     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10866     __ movq($tmp$$XMMRegister, $src1$$Register);
10867     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10868     __ movq($dst$$Register, $tmp2$$XMMRegister);
10869   %}
10870   ins_pipe( pipe_slow );
10871 %}
10872 
10873 instruct rvand8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
10874   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
10875   match(Set dst (AndReductionV src1 src2));
10876   effect(TEMP tmp, TEMP tmp2);
10877   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
10878             "vpandq  $tmp2,$tmp2,$src2\n\t"
10879             "vextracti128_high  $tmp,$tmp2\n\t"
10880             "vpandq  $tmp2,$tmp2,$tmp\n\t"
10881             "vpshufd  $tmp,$tmp2,0xE\n\t"
10882             "vpandq  $tmp2,$tmp2,$tmp\n\t"
10883             "movdq   $tmp,$src1\n\t"
10884             "vpandq  $tmp2,$tmp2,$tmp\n\t"
10885             "movdq   $dst,$tmp2\t! and reduction8L" %}
10886   ins_encode %{
10887     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
10888     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
10889     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
10890     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10891     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, 0);
10892     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10893     __ movdq($tmp$$XMMRegister, $src1$$Register);
10894     __ vpandq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
10895     __ movdq($dst$$Register, $tmp2$$XMMRegister);
10896   %}
10897   ins_pipe( pipe_slow );
10898 %}
10899 #endif
10900 
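// The Or reductions mirror the And reductions above, with por/vpor replacing
// pand/vpand at every folding step.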
10901 instruct rsor8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
10902   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10903   match(Set dst (OrReductionV src1 src2));
10904   effect(TEMP tmp, TEMP tmp2, TEMP dst);
10905   format %{
10906             "pshufd  $tmp,$src2,0x1\n\t"
10907             "por    $tmp,$src2\n\t"
10908             "movzbl  $dst,$src1\n\t"
10909             "pextrb  $tmp2,$tmp, 0x0\n\t"
10910             "orl    $dst,$tmp2\n\t"
10911             "pextrb  $tmp2,$tmp, 0x1\n\t"
10912             "orl    $dst,$tmp2\n\t"
10913             "pextrb  $tmp2,$tmp, 0x2\n\t"
10914             "orl    $dst,$tmp2\n\t"
10915             "pextrb  $tmp2,$tmp, 0x3\n\t"
10916             "orl    $dst,$tmp2\n\t"
10917             "movsbl  $dst,$dst\t! or reduction8B" %}
10918   ins_encode %{
10919     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
10920     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
10921     __ movzbl($dst$$Register, $src1$$Register);
10922     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
10923     __ orl($dst$$Register, $tmp2$$Register);
10924     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
10925     __ orl($dst$$Register, $tmp2$$Register);
10926     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
10927     __ orl($dst$$Register, $tmp2$$Register);
10928     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
10929     __ orl($dst$$Register, $tmp2$$Register);
10930     __ movsbl($dst$$Register, $dst$$Register);
10931   %}
10932   ins_pipe( pipe_slow );
10933 %}
10934 
10935 instruct rsor16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
10936   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10937   match(Set dst (OrReductionV src1 src2));
10938   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
10939   format %{ "pshufd  $tmp,$src2,0xE\n\t"
10940             "por    $tmp,$src2\n\t"
10941             "pshufd  $tmp2,$tmp,0x1\n\t"
            "por    $tmp,$tmp2\n\t"
10943             "movzbl  $dst,$src1\n\t"
10944             "pextrb  $tmp3,$tmp, 0x0\n\t"
10945             "orl    $dst,$tmp3\n\t"
10946             "pextrb  $tmp3,$tmp, 0x1\n\t"
10947             "orl    $dst,$tmp3\n\t"
10948             "pextrb  $tmp3,$tmp, 0x2\n\t"
10949             "orl    $dst,$tmp3\n\t"
10950             "pextrb  $tmp3,$tmp, 0x3\n\t"
10951             "orl    $dst,$tmp3\n\t"
10952             "movsbl  $dst,$dst\t! or reduction16B" %}
10953   ins_encode %{
10954     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
10955     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
10956     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
10957     __ por($tmp$$XMMRegister, $tmp2$$XMMRegister);
10958     __ movzbl($dst$$Register, $src1$$Register);
10959     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
10960     __ orl($dst$$Register, $tmp3$$Register);
10961     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
10962     __ orl($dst$$Register, $tmp3$$Register);
10963     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
10964     __ orl($dst$$Register, $tmp3$$Register);
10965     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
10966     __ orl($dst$$Register, $tmp3$$Register);
10967     __ movsbl($dst$$Register, $dst$$Register);
10968   %}
10969   ins_pipe( pipe_slow );
10970 %}
10971 
10972 instruct rvor32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
10973   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
10974   match(Set dst (OrReductionV src1 src2));
10975   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
10977             "vpor   $tmp,$tmp,$src2\n\t"
10978             "pshufd  $tmp2,$tmp,0xE\n\t"
10979             "vpor   $tmp,$tmp,$tmp2\n\t"
10980             "pshufd  $tmp2,$tmp,0x1\n\t"
10981             "vpor   $tmp,$tmp,$tmp2\n\t"
10982             "movzbl  $dst,$src1\n\t"
10983             "pextrb  $tmp3,$tmp, 0x0\n\t"
10984             "orl    $dst,$tmp3\n\t"
10985             "pextrb  $tmp3,$tmp, 0x1\n\t"
10986             "orl    $dst,$tmp3\n\t"
10987             "pextrb  $tmp3,$tmp, 0x2\n\t"
10988             "orl    $dst,$tmp3\n\t"
10989             "pextrb  $tmp3,$tmp, 0x3\n\t"
10990             "orl    $dst,$tmp3\n\t"
10991             "movsbl  $dst,$dst\t! or reduction32B" %}
10992   ins_encode %{
10993     int vector_len = 0;
10994     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
10995     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
10996     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
10997     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
10998     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
10999     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11000     __ movzbl($dst$$Register, $src1$$Register);
11001     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11002     __ orl($dst$$Register, $tmp3$$Register);
11003     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11004     __ orl($dst$$Register, $tmp3$$Register);
11005     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
11006     __ orl($dst$$Register, $tmp3$$Register);
11007     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
11008     __ orl($dst$$Register, $tmp3$$Register);
11009     __ movsbl($dst$$Register, $dst$$Register);
11010   %}
11011   ins_pipe( pipe_slow );
11012 %}
11013 
11014 instruct rvor64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
11015   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
11016   match(Set dst (OrReductionV src1 src2));
11017   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11018   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11019             "vpor   $tmp2,$tmp2,$src2\n\t"
11020             "vextracti128_high  $tmp,$tmp2\n\t"
11021             "vpor   $tmp,$tmp,$tmp2\n\t"
11022             "pshufd  $tmp2,$tmp,0xE\n\t"
11023             "vpor   $tmp,$tmp,$tmp2\n\t"
11024             "pshufd  $tmp2,$tmp,0x1\n\t"
11025             "vpor   $tmp,$tmp,$tmp2\n\t"
11026             "movzbl  $dst,$src1\n\t"
11027             "movdl   $tmp3,$tmp\n\t"
11028             "orl    $dst,$tmp3\n\t"
11029             "shrl    $tmp3,0x8\n\t"
11030             "orl    $dst,$tmp3\n\t"
11031             "shrl    $tmp3,0x8\n\t"
11032             "orl    $dst,$tmp3\n\t"
11033             "shrl    $tmp3,0x8\n\t"
11034             "orl    $dst,$tmp3\n\t"
11035             "movsbl  $dst,$dst\t! or reduction64B" %}
11036   ins_encode %{
11037     int vector_len = 0;
11038     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11039     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11040     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11041     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11042     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
11043     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11044     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11045     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11046     __ movzbl($dst$$Register, $src1$$Register);
11047     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
11048     __ orl($dst$$Register, $tmp3$$Register);
11049     __ shrl($tmp3$$Register, 8);
11050     __ orl($dst$$Register, $tmp3$$Register);
11051     __ shrl($tmp3$$Register, 8);
11052     __ orl($dst$$Register, $tmp3$$Register);
11053     __ shrl($tmp3$$Register, 8);
11054     __ orl($dst$$Register, $tmp3$$Register);
11055     __ movsbl($dst$$Register, $dst$$Register);
11056   %}
11057   ins_pipe( pipe_slow );
11058 %}
11059 
11060 instruct rsor4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
11061   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11062   match(Set dst (OrReductionV src1 src2));
11063   effect(TEMP tmp, TEMP tmp2, TEMP dst);
11064   format %{
11065             "pshufd  $tmp,$src2,0x1\n\t"
11066             "por    $tmp,$src2\n\t"
11067             "movzwl  $dst,$src1\n\t"
11068             "pextrw  $tmp2,$tmp, 0x0\n\t"
11069             "orw    $dst,$tmp2\n\t"
11070             "pextrw  $tmp2,$tmp, 0x1\n\t"
11071             "orw    $dst,$tmp2\n\t"
11072             "movswl  $dst,$dst\t! or reduction4S" %}
11073   ins_encode %{
11074     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
11075     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
11076     __ movzwl($dst$$Register, $src1$$Register);
11077     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
11078     __ orw($dst$$Register, $tmp2$$Register);
11079     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
11080     __ orw($dst$$Register, $tmp2$$Register);
11081     __ movswl($dst$$Register, $dst$$Register);
11082   %}
11083   ins_pipe( pipe_slow );
11084 %}
11085 
11086 instruct rsor8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
11087   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11088   match(Set dst (OrReductionV src1 src2));
11089   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11090   format %{ "pshufd  $tmp,$src2,0xE\n\t"
11091             "por    $tmp,$src2\n\t"
11092             "pshufd  $tmp2,$tmp,0x1\n\t"
            "por    $tmp,$tmp2\n\t"
11094             "movzwl  $dst,$src1\n\t"
11095             "pextrw  $tmp3,$tmp, 0x0\n\t"
11096             "orw    $dst,$tmp3\n\t"
11097             "pextrw  $tmp3,$tmp, 0x1\n\t"
11098             "orw    $dst,$tmp3\n\t"
11099             "movswl  $dst,$dst\t! or reduction8S" %}
11100   ins_encode %{
11101     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
11102     __ por($tmp$$XMMRegister, $src2$$XMMRegister);
11103     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11104     __ por($tmp$$XMMRegister, $tmp2$$XMMRegister);
11105     __ movzwl($dst$$Register, $src1$$Register);
11106     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11107     __ orw($dst$$Register, $tmp3$$Register);
11108     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11109     __ orw($dst$$Register, $tmp3$$Register);
11110     __ movswl($dst$$Register, $dst$$Register);
11111   %}
11112   ins_pipe( pipe_slow );
11113 %}
11114 
11115 instruct rvor16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
11116   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11117   match(Set dst (OrReductionV src1 src2));
11118   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
11120             "vpor   $tmp,$tmp,$src2\n\t"
11121             "pshufd  $tmp2,$tmp,0xE\n\t"
11122             "vpor   $tmp,$tmp,$tmp2\n\t"
11123             "pshufd  $tmp2,$tmp,0x1\n\t"
11124             "vpor   $tmp,$tmp,$tmp2\n\t"
11125             "movzwl  $dst,$src1\n\t"
11126             "pextrw  $tmp3,$tmp, 0x0\n\t"
11127             "orw    $dst,$tmp3\n\t"
11128             "pextrw  $tmp3,$tmp, 0x1\n\t"
11129             "orw    $dst,$tmp3\n\t"
11130             "movswl  $dst,$dst\t! or reduction16S" %}
11131   ins_encode %{
11132     int vector_len = 0;
11133     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11134     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11135     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11136     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11137     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11138     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11139     __ movzwl($dst$$Register, $src1$$Register);
11140     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11141     __ orw($dst$$Register, $tmp3$$Register);
11142     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11143     __ orw($dst$$Register, $tmp3$$Register);
11144     __ movswl($dst$$Register, $dst$$Register);
11145   %}
11146   ins_pipe( pipe_slow );
11147 %}
11148 
11149 instruct rvor32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
11150   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11151   match(Set dst (OrReductionV src1 src2));
11152   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11153   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11154             "vpor   $tmp2,$tmp2,$src2\n\t"
11155             "vextracti128_high  $tmp,$tmp2\n\t"
11156             "vpor   $tmp,$tmp,$tmp2\n\t"
11157             "pshufd  $tmp2,$tmp,0xE\n\t"
11158             "vpor   $tmp,$tmp,$tmp2\n\t"
11159             "pshufd  $tmp2,$tmp,0x1\n\t"
11160             "vpor   $tmp,$tmp,$tmp2\n\t"
11161             "movzwl  $dst,$src1\n\t"
11162             "movdl   $tmp3,$tmp\n\t"
11163             "orw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
11165             "orw    $dst,$tmp3\n\t"
11166             "movswl  $dst,$dst\t! or reduction32S" %}
11167   ins_encode %{
11168     int vector_len = 0;
11169     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11170     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11171     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11172     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11173     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
11174     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11175     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11176     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11177     __ movzwl($dst$$Register, $src1$$Register);
11178     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
11179     __ orw($dst$$Register, $tmp3$$Register);
11180     __ shrl($tmp3$$Register, 16);
11181     __ orw($dst$$Register, $tmp3$$Register);
11182     __ movswl($dst$$Register, $dst$$Register);
11183   %}
11184   ins_pipe( pipe_slow );
11185 %}
11186 
11187 instruct rsor2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
11188   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11189   match(Set dst (OrReductionV src1 src2));
11190   effect(TEMP tmp, TEMP tmp2);
11191   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
11192             "por    $tmp2,$src2\n\t"
11193             "movd    $tmp,$src1\n\t"
11194             "por    $tmp2,$tmp\n\t"
11195             "movd    $dst,$tmp2\t! or reduction2I" %}
11196   ins_encode %{
11197     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
11198     __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
11199     __ movdl($tmp$$XMMRegister, $src1$$Register);
11200     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
11201     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11202   %}
11203   ins_pipe( pipe_slow );
11204 %}
11205 
11206 instruct rsor4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
11207   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11208   match(Set dst (OrReductionV src1 src2));
11209   effect(TEMP tmp, TEMP tmp2);
11210   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11211             "por    $tmp2,$src2\n\t"
11212             "pshufd  $tmp,$tmp2,0x1\n\t"
11213             "por    $tmp2,$tmp\n\t"
11214             "movd    $tmp,$src1\n\t"
11215             "por    $tmp2,$tmp\n\t"
11216             "movd    $dst,$tmp2\t! or reduction4I" %}
11217   ins_encode %{
11218     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11219     __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
11220     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
11221     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
11222     __ movdl($tmp$$XMMRegister, $src1$$Register);
11223     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
11224     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11225   %}
11226   ins_pipe( pipe_slow );
11227 %}
11228 
11229 instruct rvor8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
11230   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11231   match(Set dst (OrReductionV src1 src2));
11232   effect(TEMP tmp, TEMP tmp2);
11233   format %{ "vextracti128_high  $tmp,$src2\n\t"
11234             "vpor    $tmp,$tmp,$src2\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpor    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
11238             "vpor    $tmp,$tmp,$tmp2\n\t"
11239             "movd     $tmp2,$src1\n\t"
11240             "vpor    $tmp2,$tmp,$tmp2\n\t"
11241             "movd     $dst,$tmp2\t! or reduction8I" %}
11242   ins_encode %{
11243     int vector_len = 0;
11244     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11245     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11246     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11247     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11248     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11249     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11250     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11251     __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11252     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11253   %}
11254   ins_pipe( pipe_slow );
11255 %}
11256 
11257 instruct rvor16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
11258   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11259   match(Set dst (OrReductionV src1 src2));
11260   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
11261   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
11262             "vpor  $tmp3,$tmp3,$src2\n\t"
11263             "vextracti128_high  $tmp,$tmp3\n\t"
            "vpor    $tmp,$tmp,$tmp3\n\t"
            "vpshufd   $tmp2,$tmp,0xE\n\t"
            "vpor    $tmp,$tmp,$tmp2\n\t"
            "vpshufd   $tmp2,$tmp,0x1\n\t"
11268             "vpor    $tmp,$tmp,$tmp2\n\t"
11269             "movd     $tmp2,$src1\n\t"
11270             "vpor    $tmp2,$tmp,$tmp2\n\t"
11271             "movd     $dst,$tmp2\t! or reduction16I" %}
11272   ins_encode %{
11273     int vector_len = 0;
11274     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
11275     __ vpor($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
11276     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
11277     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
11278     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11279     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11280     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11281     __ vpor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11282     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11283     __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11284     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11285   %}
11286   ins_pipe( pipe_slow );
11287 %}
11288 
#ifdef _LP64
instruct rsor2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
11290   predicate(UseSSE >= 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11291   match(Set dst (OrReductionV src1 src2));
11292   effect(TEMP tmp, TEMP tmp2);
11293   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11294             "por    $tmp2,$src2\n\t"
11295             "movdq   $tmp,$src1\n\t"
11296             "por    $tmp2,$tmp\n\t"
11297             "movq   $dst,$tmp2\t! or reduction2L" %}
11298   ins_encode %{
11299     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11300     __ por($tmp2$$XMMRegister, $src2$$XMMRegister);
11301     __ movdq($tmp$$XMMRegister, $src1$$Register);
11302     __ por($tmp2$$XMMRegister, $tmp$$XMMRegister);
11303     __ movq($dst$$Register, $tmp2$$XMMRegister);
11304   %}
11305   ins_pipe( pipe_slow );
11306 %}
11307 
11308 instruct rvor4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
11309   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11310   match(Set dst (OrReductionV src1 src2));
11311   effect(TEMP tmp, TEMP tmp2);
11312   format %{ "vextracti128_high  $tmp,$src2\n\t"
11313             "vpor  $tmp2,$tmp,$src2\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
11315             "vpor  $tmp2,$tmp2,$tmp\n\t"
11316             "movq   $tmp,$src1\n\t"
11317             "vpor  $tmp2,$tmp2,$tmp\n\t"
11318             "movq   $dst,$tmp2\t! or reduction4L" %}
11319   ins_encode %{
11320     int vector_len = 0;
11321     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11322     __ vpor($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11323     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11324     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11325     __ movq($tmp$$XMMRegister, $src1$$Register);
11326     __ vpor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11327     __ movq($dst$$Register, $tmp2$$XMMRegister);
11328   %}
11329   ins_pipe( pipe_slow );
11330 %}
11331 
11333 instruct rvor8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
11334   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11335   match(Set dst (OrReductionV src1 src2));
11336   effect(TEMP tmp, TEMP tmp2);
11337   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11338             "vporq  $tmp2,$tmp2,$src2\n\t"
11339             "vextracti128_high  $tmp,$tmp2\n\t"
11340             "vporq  $tmp2,$tmp2,$tmp\n\t"
            "vpshufd  $tmp,$tmp2,0xE\n\t"
11342             "vporq  $tmp2,$tmp2,$tmp\n\t"
11343             "movdq   $tmp,$src1\n\t"
11344             "vporq  $tmp2,$tmp2,$tmp\n\t"
11345             "movdq   $dst,$tmp2\t! or reduction8L" %}
11346   ins_encode %{
11347     int vector_len = 0;
11348     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11349     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11350     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11351     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11352     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11353     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11354     __ movdq($tmp$$XMMRegister, $src1$$Register);
11355     __ vporq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11356     __ movdq($dst$$Register, $tmp2$$XMMRegister);
11357   %}
11358   ins_pipe( pipe_slow );
11359 %}
11360 #endif
11361 
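// The Xor reductions use the same folding scheme again, with pxor/vpxor in
// place of pand/vpand.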
11362 instruct rsxor8B_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
11363   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
11364   match(Set dst (XorReductionV src1 src2));
11365   effect(TEMP tmp, TEMP tmp2, TEMP dst);
11366   format %{
11367             "pshufd  $tmp,$src2,0x1\n\t"
11368             "pxor    $tmp,$src2\n\t"
11369             "movzbl  $dst,$src1\n\t"
11370             "pextrb  $tmp2,$tmp, 0x0\n\t"
11371             "xorl    $dst,$tmp2\n\t"
11372             "pextrb  $tmp2,$tmp, 0x1\n\t"
11373             "xorl    $dst,$tmp2\n\t"
11374             "pextrb  $tmp2,$tmp, 0x2\n\t"
11375             "xorl    $dst,$tmp2\n\t"
11376             "pextrb  $tmp2,$tmp, 0x3\n\t"
11377             "xorl    $dst,$tmp2\n\t"
11378             "movsbl  $dst,$dst\t! xor reduction8B" %}
11379   ins_encode %{
11380     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
11381     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
11382     __ movzbl($dst$$Register, $src1$$Register);
11383     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x0);
11384     __ xorl($dst$$Register, $tmp2$$Register);
11385     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x1);
11386     __ xorl($dst$$Register, $tmp2$$Register);
11387     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x2);
11388     __ xorl($dst$$Register, $tmp2$$Register);
11389     __ pextrb($tmp2$$Register, $tmp$$XMMRegister, 0x3);
11390     __ xorl($dst$$Register, $tmp2$$Register);
11391     __ movsbl($dst$$Register, $dst$$Register);
11392   %}
11393   ins_pipe( pipe_slow );
11394 %}
11395 
11396 instruct rsxor16B_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
11397   predicate(UseSSE > 3 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
11398   match(Set dst (XorReductionV src1 src2));
11399   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11400   format %{ "pshufd  $tmp,$src2,0xE\n\t"
11401             "pxor    $tmp,$src2\n\t"
11402             "pshufd  $tmp2,$tmp,0x1\n\t"
            "pxor    $tmp,$tmp2\n\t"
11404             "movzbl  $dst,$src1\n\t"
11405             "pextrb  $tmp3,$tmp, 0x0\n\t"
11406             "xorl    $dst,$tmp3\n\t"
11407             "pextrb  $tmp3,$tmp, 0x1\n\t"
11408             "xorl    $dst,$tmp3\n\t"
11409             "pextrb  $tmp3,$tmp, 0x2\n\t"
11410             "xorl    $dst,$tmp3\n\t"
11411             "pextrb  $tmp3,$tmp, 0x3\n\t"
11412             "xorl    $dst,$tmp3\n\t"
11413             "movsbl  $dst,$dst\t! xor reduction16B" %}
11414   ins_encode %{
11415     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
11416     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
11417     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11418     __ pxor($tmp$$XMMRegister, $tmp2$$XMMRegister);
11419     __ movzbl($dst$$Register, $src1$$Register);
11420     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11421     __ xorl($dst$$Register, $tmp3$$Register);
11422     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11423     __ xorl($dst$$Register, $tmp3$$Register);
11424     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
11425     __ xorl($dst$$Register, $tmp3$$Register);
11426     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
11427     __ xorl($dst$$Register, $tmp3$$Register);
11428     __ movsbl($dst$$Register, $dst$$Register);
11429   %}
11430   ins_pipe( pipe_slow );
11431 %}
11432 
11433 instruct rvxor32B_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
11434   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
11435   match(Set dst (XorReductionV src1 src2));
11436   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
11438             "vpxor   $tmp,$tmp,$src2\n\t"
11439             "pshufd  $tmp2,$tmp,0xE\n\t"
11440             "vpxor   $tmp,$tmp,$tmp2\n\t"
11441             "pshufd  $tmp2,$tmp,0x1\n\t"
11442             "vpxor   $tmp,$tmp,$tmp2\n\t"
11443             "movzbl  $dst,$src1\n\t"
11444             "pextrb  $tmp3,$tmp, 0x0\n\t"
11445             "xorl    $dst,$tmp3\n\t"
11446             "pextrb  $tmp3,$tmp, 0x1\n\t"
11447             "xorl    $dst,$tmp3\n\t"
11448             "pextrb  $tmp3,$tmp, 0x2\n\t"
11449             "xorl    $dst,$tmp3\n\t"
11450             "pextrb  $tmp3,$tmp, 0x3\n\t"
11451             "xorl    $dst,$tmp3\n\t"
11452             "movsbl  $dst,$dst\t! xor reduction32B" %}
11453   ins_encode %{
11454     int vector_len = 0;
11455     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11456     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11457     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11458     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11459     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11460     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11461     __ movzbl($dst$$Register, $src1$$Register);
11462     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11463     __ xorl($dst$$Register, $tmp3$$Register);
11464     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11465     __ xorl($dst$$Register, $tmp3$$Register);
11466     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x2);
11467     __ xorl($dst$$Register, $tmp3$$Register);
11468     __ pextrb($tmp3$$Register, $tmp$$XMMRegister, 0x3);
11469     __ xorl($dst$$Register, $tmp3$$Register);
11470     __ movsbl($dst$$Register, $dst$$Register);
11471   %}
11472   ins_pipe( pipe_slow );
11473 %}
11474 
11475 instruct rvxor64B_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
11476   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
11477   match(Set dst (XorReductionV src1 src2));
11478   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11479   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11480             "vpxor   $tmp2,$tmp2,$src2\n\t"
11481             "vextracti128_high  $tmp,$tmp2\n\t"
11482             "vpxor   $tmp,$tmp,$tmp2\n\t"
11483             "pshufd  $tmp2,$tmp,0xE\n\t"
11484             "vpxor   $tmp,$tmp,$tmp2\n\t"
11485             "pshufd  $tmp2,$tmp,0x1\n\t"
11486             "vpxor   $tmp,$tmp,$tmp2\n\t"
11487             "movzbl  $dst,$src1\n\t"
11488             "movdl   $tmp3,$tmp\n\t"
11489             "xorl    $dst,$tmp3\n\t"
11490             "shrl    $tmp3,0x8\n\t"
11491             "xorl    $dst,$tmp3\n\t"
11492             "shrl    $tmp3,0x8\n\t"
11493             "xorl    $dst,$tmp3\n\t"
11494             "shrl    $tmp3,0x8\n\t"
11495             "xorl    $dst,$tmp3\n\t"
11496             "movsbl  $dst,$dst\t! xor reduction64B" %}
11497   ins_encode %{
11498     int vector_len = 0;
11499     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11500     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11501     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11502     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11503     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
11504     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11505     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11506     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11507     __ movzbl($dst$$Register, $src1$$Register);
11508     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
11509     __ xorl($dst$$Register, $tmp3$$Register);
11510     __ shrl($tmp3$$Register, 8);
11511     __ xorl($dst$$Register, $tmp3$$Register);
11512     __ shrl($tmp3$$Register, 8);
11513     __ xorl($dst$$Register, $tmp3$$Register);
11514     __ shrl($tmp3$$Register, 8);
11515     __ xorl($dst$$Register, $tmp3$$Register);
11516     __ movsbl($dst$$Register, $dst$$Register);
11517   %}
11518   ins_pipe( pipe_slow );
11519 %}
11520 
11521 instruct rsxor4S_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, rRegI tmp2) %{
11522   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11523   match(Set dst (XorReductionV src1 src2));
11524   effect(TEMP tmp, TEMP tmp2, TEMP dst);
11525   format %{
11526             "pshufd  $tmp,$src2,0x1\n\t"
11527             "pxor    $tmp,$src2\n\t"
11528             "movzwl  $dst,$src1\n\t"
11529             "pextrw  $tmp2,$tmp, 0x0\n\t"
11530             "xorw    $dst,$tmp2\n\t"
11531             "pextrw  $tmp2,$tmp, 0x1\n\t"
11532             "xorw    $dst,$tmp2\n\t"
11533             "movswl  $dst,$dst\t! xor reduction4S" %}
11534   ins_encode %{
11535     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
11536     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
11537     __ movzwl($dst$$Register, $src1$$Register);
11538     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x0);
11539     __ xorw($dst$$Register, $tmp2$$Register);
11540     __ pextrw($tmp2$$Register, $tmp$$XMMRegister, 0x1);
11541     __ xorw($dst$$Register, $tmp2$$Register);
11542     __ movswl($dst$$Register, $dst$$Register);
11543   %}
11544   ins_pipe( pipe_slow );
11545 %}
11546 
11547 instruct rsxor8S_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2, rRegI tmp3) %{
11548   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11549   match(Set dst (XorReductionV src1 src2));
11550   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11551   format %{ "pshufd  $tmp,$src2,0xE\n\t"
11552             "pxor    $tmp,$src2\n\t"
11553             "pshufd  $tmp2,$tmp,0x1\n\t"
            "pxor    $tmp,$tmp2\n\t"
11555             "movzwl  $dst,$src1\n\t"
11556             "pextrw  $tmp3,$tmp, 0x0\n\t"
11557             "xorw    $dst,$tmp3\n\t"
11558             "pextrw  $tmp3,$tmp, 0x1\n\t"
11559             "xorw    $dst,$tmp3\n\t"
11560             "movswl  $dst,$dst\t! xor reduction8S" %}
11561   ins_encode %{
11562     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
11563     __ pxor($tmp$$XMMRegister, $src2$$XMMRegister);
11564     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11565     __ pxor($tmp$$XMMRegister, $tmp2$$XMMRegister);
11566     __ movzwl($dst$$Register, $src1$$Register);
11567     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11568     __ xorw($dst$$Register, $tmp3$$Register);
11569     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11570     __ xorw($dst$$Register, $tmp3$$Register);
11571     __ movswl($dst$$Register, $dst$$Register);
11572   %}
11573   ins_pipe( pipe_slow );
11574 %}
11575 
11576 instruct rvxor16S_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2, rRegI tmp3) %{
11577   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11578   match(Set dst (XorReductionV src1 src2));
11579   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
  format %{ "vextracti128_high  $tmp,$src2\n\t"
11581             "vpxor   $tmp,$tmp,$src2\n\t"
11582             "pshufd  $tmp2,$tmp,0xE\n\t"
11583             "vpxor   $tmp,$tmp,$tmp2\n\t"
11584             "pshufd  $tmp2,$tmp,0x1\n\t"
11585             "vpxor   $tmp,$tmp,$tmp2\n\t"
11586             "movzwl  $dst,$src1\n\t"
11587             "pextrw  $tmp3,$tmp, 0x0\n\t"
11588             "xorw    $dst,$tmp3\n\t"
11589             "pextrw  $tmp3,$tmp, 0x1\n\t"
11590             "xorw    $dst,$tmp3\n\t"
11591             "movswl  $dst,$dst\t! xor reduction16S" %}
11592   ins_encode %{
11593     int vector_len = 0;
11594     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11595     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11596     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11597     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11598     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11599     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11600     __ movzwl($dst$$Register, $src1$$Register);
11601     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x0);
11602     __ xorw($dst$$Register, $tmp3$$Register);
11603     __ pextrw($tmp3$$Register, $tmp$$XMMRegister, 0x1);
11604     __ xorw($dst$$Register, $tmp3$$Register);
11605     __ movswl($dst$$Register, $dst$$Register);
11606   %}
11607   ins_pipe( pipe_slow );
11608 %}
11609 
11610 instruct rvxor32S_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, rRegI tmp3) %{
11611   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
11612   match(Set dst (XorReductionV src1 src2));
11613   effect(TEMP tmp, TEMP tmp2, TEMP tmp3, TEMP dst);
11614   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11615             "vpxor   $tmp2,$tmp2,$src2\n\t"
11616             "vextracti128_high  $tmp,$tmp2\n\t"
11617             "vpxor   $tmp,$tmp,$tmp2\n\t"
11618             "pshufd  $tmp2,$tmp,0xE\n\t"
11619             "vpxor   $tmp,$tmp,$tmp2\n\t"
11620             "pshufd  $tmp2,$tmp,0x1\n\t"
11621             "vpxor   $tmp,$tmp,$tmp2\n\t"
11622             "movzwl  $dst,$src1\n\t"
11623             "movdl   $tmp3,$tmp\n\t"
11624             "xorw    $dst,$tmp3\n\t"
            "shrl    $tmp3,0x10\n\t"
11626             "xorw    $dst,$tmp3\n\t"
11627             "movswl  $dst,$dst\t! xor reduction32S" %}
11628   ins_encode %{
11629     int vector_len = 0;
11630     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11631     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11632     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11633     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11634     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
11635     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11636     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
11637     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11638     __ movzwl($dst$$Register, $src1$$Register);
11639     __ movdl($tmp3$$Register, $tmp$$XMMRegister);
11640     __ xorw($dst$$Register, $tmp3$$Register);
11641     __ shrl($tmp3$$Register, 16);
11642     __ xorw($dst$$Register, $tmp3$$Register);
11643     __ movswl($dst$$Register, $dst$$Register);
11644   %}
11645   ins_pipe( pipe_slow );
11646 %}
11647 
11648 instruct rsxor2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
11649   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11650   match(Set dst (XorReductionV src1 src2));
11651   effect(TEMP tmp, TEMP tmp2);
11652   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
11653             "pxor    $tmp2,$src2\n\t"
11654             "movd    $tmp,$src1\n\t"
11655             "pxor    $tmp2,$tmp\n\t"
11656             "movd    $dst,$tmp2\t! xor reduction2I" %}
11657   ins_encode %{
11658     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
11659     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11660     __ movdl($tmp$$XMMRegister, $src1$$Register);
11661     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11662     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11663   %}
11664   ins_pipe( pipe_slow );
11665 %}
11666 
11667 instruct rsxor4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
11668   predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11669   match(Set dst (XorReductionV src1 src2));
11670   effect(TEMP tmp, TEMP tmp2);
11671   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11672             "pxor    $tmp2,$src2\n\t"
11673             "pshufd  $tmp,$tmp2,0x1\n\t"
11674             "pxor    $tmp2,$tmp\n\t"
11675             "movd    $tmp,$src1\n\t"
11676             "pxor    $tmp2,$tmp\n\t"
11677             "movd    $dst,$tmp2\t! xor reduction4I" %}
11678   ins_encode %{
11679     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11680     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11681     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
11682     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11683     __ movdl($tmp$$XMMRegister, $src1$$Register);
11684     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11685     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11686   %}
11687   ins_pipe( pipe_slow );
11688 %}
11689 
11690 instruct rvxor8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
11691   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11692   match(Set dst (XorReductionV src1 src2));
11693   effect(TEMP tmp, TEMP tmp2);
11694   format %{ "vextracti128_high  $tmp,$src2\n\t"
11695             "vpxor    $tmp,$tmp,$src2\n\t"
11696             "vpshufd   $tmp2,$tmp,0xE\t"
11697             "vpxor    $tmp,$tmp,$tmp2\n\t"
11698             "vpshufd   $tmp2,$tmp,0x1\t"
11699             "vpxor    $tmp,$tmp,$tmp2\n\t"
11700             "movd     $tmp2,$src1\n\t"
11701             "vpxor    $tmp2,$tmp,$tmp2\n\t"
11702             "movd     $dst,$tmp2\t! xor reduction8I" %}
11703   ins_encode %{
11704     int vector_len = 0;
11705     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11706     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11707     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11708     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11709     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11710     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11711     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11712     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11713     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11714   %}
11715   ins_pipe( pipe_slow );
11716 %}
11717 
11718 instruct rvxor16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
11719   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT);
11720   match(Set dst (XorReductionV src1 src2));
11721   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
11722   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
11723             "vpxor  $tmp3,$tmp3,$src2\n\t"
11724             "vextracti128_high  $tmp,$tmp3\n\t"
11725             "vpxor    $tmp,$tmp,$src2\n\t"
11726             "vpshufd   $tmp2,$tmp,0xE\t"
11727             "vpxor    $tmp,$tmp,$tmp2\n\t"
11728             "vpshufd   $tmp2,$tmp,0x1\t"
11729             "vpxor    $tmp,$tmp,$tmp2\n\t"
11730             "movd     $tmp2,$src1\n\t"
11731             "vpxor    $tmp2,$tmp,$tmp2\n\t"
11732             "movd     $dst,$tmp2\t! xor reduction16I" %}
11733   ins_encode %{
11734     int vector_len = 0;
11735     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
11736     __ vpxor($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
11737     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
11738     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
11739     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE, vector_len);
11740     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11741     __ vpshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1, vector_len);
11742     __ vpxor($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11743     __ movdl($tmp2$$XMMRegister, $src1$$Register);
11744     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
11745     __ movdl($dst$$Register, $tmp2$$XMMRegister);
11746   %}
11747   ins_pipe( pipe_slow );
11748 %}
11749 
11750 instruct rsxor2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 1 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11752   match(Set dst (XorReductionV src1 src2));
11753   effect(TEMP tmp, TEMP tmp2);
11754   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
11755             "pxor    $tmp2,$src2\n\t"
11756             "movdq   $tmp,$src1\n\t"
11757             "pxor    $tmp2,$tmp\n\t"
11758             "movq   $dst,$tmp2\t! xor reduction2L" %}
11759   ins_encode %{
11760     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
11761     __ pxor($tmp2$$XMMRegister, $src2$$XMMRegister);
11762     __ movdq($tmp$$XMMRegister, $src1$$Register);
11763     __ pxor($tmp2$$XMMRegister, $tmp$$XMMRegister);
11764     __ movq($dst$$Register, $tmp2$$XMMRegister);
11765   %}
11766   ins_pipe( pipe_slow );
11767 %}
11768 
11769 instruct rvxor4L_reduction_reg_avx(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
11770   predicate(UseAVX > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11771   match(Set dst (XorReductionV src1 src2));
11772   effect(TEMP tmp, TEMP tmp2);
11773   format %{ "vextracti128_high  $tmp,$src2\n\t"
11774             "vpxor  $tmp2,$tmp,$src2\n\t"
11775             "vpshufd  $tmp,$tmp2,0xE\t"
11776             "vpxor  $tmp2,$tmp2,$tmp\n\t"
11777             "movq   $tmp,$src1\n\t"
11778             "vpxor  $tmp2,$tmp2,$tmp\n\t"
11779             "movq   $dst,$tmp2\t! xor reduction4L" %}
11780   ins_encode %{
11781     int vector_len = 0;
11782     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
11783     __ vpxor($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
11784     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11785     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11786     __ movq($tmp$$XMMRegister, $src1$$Register);
11787     __ vpxor($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11788     __ movq($dst$$Register, $tmp2$$XMMRegister);
11789   %}
11790   ins_pipe( pipe_slow );
11791 %}
11792 
11793 #ifdef _LP64
11794 instruct rvxor8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
11795   predicate(UseAVX > 2 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
11796   match(Set dst (XorReductionV src1 src2));
11797   effect(TEMP tmp, TEMP tmp2);
11798   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
11799             "vpxorq  $tmp2,$tmp2,$src2\n\t"
11800             "vextracti128_high  $tmp,$tmp2\n\t"
11801             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11802             "vpshufd  $tmp,$tmp2,0xE\t"
11803             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11804             "movdq   $tmp,$src1\n\t"
11805             "vpxorq  $tmp2,$tmp2,$tmp\n\t"
11806             "movdq   $dst,$tmp2\t! xor reduction8L" %}
11807   ins_encode %{
11808     int vector_len = 0;
11809     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
11810     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
11811     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
11812     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11813     __ vpshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE, vector_len);
11814     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11815     __ movdq($tmp$$XMMRegister, $src1$$Register);
11816     __ vpxorq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
11817     __ movdq($dst$$Register, $tmp2$$XMMRegister);
11818   %}
11819   ins_pipe( pipe_slow );
11820 %}
11821 #endif
11822 
11823 // ====================VECTOR ARITHMETIC=======================================
11824 
11825 // --------------------------------- ADD --------------------------------------
11826 
11827 // Bytes vector add
11828 instruct vadd4B(vecS dst, vecS src) %{
11829   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
11830   match(Set dst (AddVB dst src));
11831   format %{ "paddb   $dst,$src\t! add packed4B" %}
11832   ins_encode %{
11833     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11834   %}
11835   ins_pipe( pipe_slow );
11836 %}
11837 
11838 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
11839   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
11840   match(Set dst (AddVB src1 src2));
11841   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
11842   ins_encode %{
11843     int vector_len = 0;
11844     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11845   %}
11846   ins_pipe( pipe_slow );
11847 %}
11848 
11849 
11850 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
11851   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
11852   match(Set dst (AddVB src (LoadVector mem)));
11853   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
11854   ins_encode %{
11855     int vector_len = 0;
11856     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11857   %}
11858   ins_pipe( pipe_slow );
11859 %}
11860 
11861 instruct vadd8B(vecD dst, vecD src) %{
11862   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
11863   match(Set dst (AddVB dst src));
11864   format %{ "paddb   $dst,$src\t! add packed8B" %}
11865   ins_encode %{
11866     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11867   %}
11868   ins_pipe( pipe_slow );
11869 %}
11870 
11871 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
11872   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
11873   match(Set dst (AddVB src1 src2));
11874   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
11875   ins_encode %{
11876     int vector_len = 0;
11877     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11878   %}
11879   ins_pipe( pipe_slow );
11880 %}
11881 
11882 
11883 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
11884   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
11885   match(Set dst (AddVB src (LoadVector mem)));
11886   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
11887   ins_encode %{
11888     int vector_len = 0;
11889     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11890   %}
11891   ins_pipe( pipe_slow );
11892 %}
11893 
11894 instruct vadd16B(vecX dst, vecX src) %{
11895   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
11896   match(Set dst (AddVB dst src));
11897   format %{ "paddb   $dst,$src\t! add packed16B" %}
11898   ins_encode %{
11899     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
11900   %}
11901   ins_pipe( pipe_slow );
11902 %}
11903 
11904 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
11906   match(Set dst (AddVB src1 src2));
11907   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
11908   ins_encode %{
11909     int vector_len = 0;
11910     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11911   %}
11912   ins_pipe( pipe_slow );
11913 %}
11914 
11915 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
11916   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
11917   match(Set dst (AddVB src (LoadVector mem)));
11918   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
11919   ins_encode %{
11920     int vector_len = 0;
11921     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11922   %}
11923   ins_pipe( pipe_slow );
11924 %}
11925 
11926 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
11927   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
11928   match(Set dst (AddVB src1 src2));
11929   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
11930   ins_encode %{
11931     int vector_len = 1;
11932     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11933   %}
11934   ins_pipe( pipe_slow );
11935 %}
11936 
11937 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
11938   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
11939   match(Set dst (AddVB src (LoadVector mem)));
11940   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
11941   ins_encode %{
11942     int vector_len = 1;
11943     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11944   %}
11945   ins_pipe( pipe_slow );
11946 %}
11947 
11948 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
11949   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
11950   match(Set dst (AddVB src1 src2));
11951   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
11952   ins_encode %{
11953     int vector_len = 2;
11954     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11955   %}
11956   ins_pipe( pipe_slow );
11957 %}
11958 
11959 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
11960   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
11961   match(Set dst (AddVB src (LoadVector mem)));
11962   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
11963   ins_encode %{
11964     int vector_len = 2;
11965     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11966   %}
11967   ins_pipe( pipe_slow );
11968 %}
11969 
11970 // Shorts/Chars vector add
11971 instruct vadd2S(vecS dst, vecS src) %{
11972   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
11973   match(Set dst (AddVS dst src));
11974   format %{ "paddw   $dst,$src\t! add packed2S" %}
11975   ins_encode %{
11976     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
11977   %}
11978   ins_pipe( pipe_slow );
11979 %}
11980 
11981 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
11983   match(Set dst (AddVS src1 src2));
11984   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
11985   ins_encode %{
11986     int vector_len = 0;
11987     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
11988   %}
11989   ins_pipe( pipe_slow );
11990 %}
11991 
11992 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
11993   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
11994   match(Set dst (AddVS src (LoadVector mem)));
11995   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
11996   ins_encode %{
11997     int vector_len = 0;
11998     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
11999   %}
12000   ins_pipe( pipe_slow );
12001 %}
12002 
12003 instruct vadd4S(vecD dst, vecD src) %{
12004   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12005   match(Set dst (AddVS dst src));
12006   format %{ "paddw   $dst,$src\t! add packed4S" %}
12007   ins_encode %{
12008     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
12009   %}
12010   ins_pipe( pipe_slow );
12011 %}
12012 
12013 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
12014   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12015   match(Set dst (AddVS src1 src2));
12016   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
12017   ins_encode %{
12018     int vector_len = 0;
12019     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12020   %}
12021   ins_pipe( pipe_slow );
12022 %}
12023 
12024 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
12025   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12026   match(Set dst (AddVS src (LoadVector mem)));
12027   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
12028   ins_encode %{
12029     int vector_len = 0;
12030     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12031   %}
12032   ins_pipe( pipe_slow );
12033 %}
12034 
12035 instruct vadd8S(vecX dst, vecX src) %{
12036   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12037   match(Set dst (AddVS dst src));
12038   format %{ "paddw   $dst,$src\t! add packed8S" %}
12039   ins_encode %{
12040     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
12041   %}
12042   ins_pipe( pipe_slow );
12043 %}
12044 
12045 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
12046   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12047   match(Set dst (AddVS src1 src2));
12048   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
12049   ins_encode %{
12050     int vector_len = 0;
12051     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12052   %}
12053   ins_pipe( pipe_slow );
12054 %}
12055 
12056 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
12057   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12058   match(Set dst (AddVS src (LoadVector mem)));
12059   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
12060   ins_encode %{
12061     int vector_len = 0;
12062     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12063   %}
12064   ins_pipe( pipe_slow );
12065 %}
12066 
12067 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
12068   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12069   match(Set dst (AddVS src1 src2));
12070   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
12071   ins_encode %{
12072     int vector_len = 1;
12073     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12074   %}
12075   ins_pipe( pipe_slow );
12076 %}
12077 
12078 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
12079   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12080   match(Set dst (AddVS src (LoadVector mem)));
12081   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
12082   ins_encode %{
12083     int vector_len = 1;
12084     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12085   %}
12086   ins_pipe( pipe_slow );
12087 %}
12088 
12089 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
12090   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12091   match(Set dst (AddVS src1 src2));
12092   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
12093   ins_encode %{
12094     int vector_len = 2;
12095     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12096   %}
12097   ins_pipe( pipe_slow );
12098 %}
12099 
12100 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
12101   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12102   match(Set dst (AddVS src (LoadVector mem)));
12103   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
12104   ins_encode %{
12105     int vector_len = 2;
12106     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12107   %}
12108   ins_pipe( pipe_slow );
12109 %}
12110 
12111 // Integers vector add
12112 instruct vadd2I(vecD dst, vecD src) %{
12113   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12114   match(Set dst (AddVI dst src));
12115   format %{ "paddd   $dst,$src\t! add packed2I" %}
12116   ins_encode %{
12117     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
12118   %}
12119   ins_pipe( pipe_slow );
12120 %}
12121 
12122 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
12123   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12124   match(Set dst (AddVI src1 src2));
12125   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
12126   ins_encode %{
12127     int vector_len = 0;
12128     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12129   %}
12130   ins_pipe( pipe_slow );
12131 %}
12132 
12133 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
12134   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12135   match(Set dst (AddVI src (LoadVector mem)));
12136   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
12137   ins_encode %{
12138     int vector_len = 0;
12139     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12140   %}
12141   ins_pipe( pipe_slow );
12142 %}
12143 
12144 instruct vadd4I(vecX dst, vecX src) %{
12145   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12146   match(Set dst (AddVI dst src));
12147   format %{ "paddd   $dst,$src\t! add packed4I" %}
12148   ins_encode %{
12149     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
12150   %}
12151   ins_pipe( pipe_slow );
12152 %}
12153 
12154 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
12155   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12156   match(Set dst (AddVI src1 src2));
12157   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
12158   ins_encode %{
12159     int vector_len = 0;
12160     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12161   %}
12162   ins_pipe( pipe_slow );
12163 %}
12164 
12165 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
12166   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12167   match(Set dst (AddVI src (LoadVector mem)));
12168   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
12169   ins_encode %{
12170     int vector_len = 0;
12171     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12172   %}
12173   ins_pipe( pipe_slow );
12174 %}
12175 
12176 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
12177   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12178   match(Set dst (AddVI src1 src2));
12179   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
12180   ins_encode %{
12181     int vector_len = 1;
12182     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12183   %}
12184   ins_pipe( pipe_slow );
12185 %}
12186 
12187 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
12188   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12189   match(Set dst (AddVI src (LoadVector mem)));
12190   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
12191   ins_encode %{
12192     int vector_len = 1;
12193     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12194   %}
12195   ins_pipe( pipe_slow );
12196 %}
12197 
12198 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
12199   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12200   match(Set dst (AddVI src1 src2));
12201   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
12202   ins_encode %{
12203     int vector_len = 2;
12204     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12205   %}
12206   ins_pipe( pipe_slow );
12207 %}
12208 
12209 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
12210   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12211   match(Set dst (AddVI src (LoadVector mem)));
12212   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
12213   ins_encode %{
12214     int vector_len = 2;
12215     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12216   %}
12217   ins_pipe( pipe_slow );
12218 %}
12219 
12220 // Longs vector add
12221 instruct vadd2L(vecX dst, vecX src) %{
12222   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12223   match(Set dst (AddVL dst src));
12224   format %{ "paddq   $dst,$src\t! add packed2L" %}
12225   ins_encode %{
12226     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
12227   %}
12228   ins_pipe( pipe_slow );
12229 %}
12230 
12231 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
12232   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12233   match(Set dst (AddVL src1 src2));
12234   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
12235   ins_encode %{
12236     int vector_len = 0;
12237     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12238   %}
12239   ins_pipe( pipe_slow );
12240 %}
12241 
12242 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
12243   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12244   match(Set dst (AddVL src (LoadVector mem)));
12245   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
12246   ins_encode %{
12247     int vector_len = 0;
12248     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12249   %}
12250   ins_pipe( pipe_slow );
12251 %}
12252 
12253 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
12254   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12255   match(Set dst (AddVL src1 src2));
12256   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
12257   ins_encode %{
12258     int vector_len = 1;
12259     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12260   %}
12261   ins_pipe( pipe_slow );
12262 %}
12263 
12264 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
12265   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12266   match(Set dst (AddVL src (LoadVector mem)));
12267   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
12268   ins_encode %{
12269     int vector_len = 1;
12270     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12271   %}
12272   ins_pipe( pipe_slow );
12273 %}
12274 
12275 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
12276   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12277   match(Set dst (AddVL src1 src2));
12278   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
12279   ins_encode %{
12280     int vector_len = 2;
12281     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12282   %}
12283   ins_pipe( pipe_slow );
12284 %}
12285 
12286 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
12287   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12288   match(Set dst (AddVL src (LoadVector mem)));
12289   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
12290   ins_encode %{
12291     int vector_len = 2;
12292     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12293   %}
12294   ins_pipe( pipe_slow );
12295 %}
12296 
12297 // Floats vector add
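// Unlike the integer forms above, 256-bit float/double arithmetic is part
// of AVX1, so the 8F (and 4D) forms only require UseAVX > 0, whereas the
// 256-bit integer forms need AVX2 (UseAVX > 1).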
12298 instruct vadd2F(vecD dst, vecD src) %{
12299   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12300   match(Set dst (AddVF dst src));
12301   format %{ "addps   $dst,$src\t! add packed2F" %}
12302   ins_encode %{
12303     __ addps($dst$$XMMRegister, $src$$XMMRegister);
12304   %}
12305   ins_pipe( pipe_slow );
12306 %}
12307 
12308 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
12309   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12310   match(Set dst (AddVF src1 src2));
12311   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
12312   ins_encode %{
12313     int vector_len = 0;
12314     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12315   %}
12316   ins_pipe( pipe_slow );
12317 %}
12318 
12319 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
12320   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12321   match(Set dst (AddVF src (LoadVector mem)));
12322   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
12323   ins_encode %{
12324     int vector_len = 0;
12325     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12326   %}
12327   ins_pipe( pipe_slow );
12328 %}
12329 
12330 instruct vadd4F(vecX dst, vecX src) %{
12331   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12332   match(Set dst (AddVF dst src));
12333   format %{ "addps   $dst,$src\t! add packed4F" %}
12334   ins_encode %{
12335     __ addps($dst$$XMMRegister, $src$$XMMRegister);
12336   %}
12337   ins_pipe( pipe_slow );
12338 %}
12339 
12340 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
12341   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12342   match(Set dst (AddVF src1 src2));
12343   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
12344   ins_encode %{
12345     int vector_len = 0;
12346     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12347   %}
12348   ins_pipe( pipe_slow );
12349 %}
12350 
12351 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
12352   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12353   match(Set dst (AddVF src (LoadVector mem)));
12354   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
12355   ins_encode %{
12356     int vector_len = 0;
12357     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12358   %}
12359   ins_pipe( pipe_slow );
12360 %}
12361 
12362 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
12363   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12364   match(Set dst (AddVF src1 src2));
12365   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
12366   ins_encode %{
12367     int vector_len = 1;
12368     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12369   %}
12370   ins_pipe( pipe_slow );
12371 %}
12372 
12373 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
12374   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12375   match(Set dst (AddVF src (LoadVector mem)));
12376   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
12377   ins_encode %{
12378     int vector_len = 1;
12379     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12380   %}
12381   ins_pipe( pipe_slow );
12382 %}
12383 
12384 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
12385   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12386   match(Set dst (AddVF src1 src2));
12387   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
12388   ins_encode %{
12389     int vector_len = 2;
12390     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12391   %}
12392   ins_pipe( pipe_slow );
12393 %}
12394 
12395 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
12396   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12397   match(Set dst (AddVF src (LoadVector mem)));
12398   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
12399   ins_encode %{
12400     int vector_len = 2;
12401     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12402   %}
12403   ins_pipe( pipe_slow );
12404 %}
12405 
12406 // Doubles vector add
12407 instruct vadd2D(vecX dst, vecX src) %{
12408   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12409   match(Set dst (AddVD dst src));
12410   format %{ "addpd   $dst,$src\t! add packed2D" %}
12411   ins_encode %{
12412     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
12413   %}
12414   ins_pipe( pipe_slow );
12415 %}
12416 
12417 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
12418   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12419   match(Set dst (AddVD src1 src2));
12420   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
12421   ins_encode %{
12422     int vector_len = 0;
12423     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12424   %}
12425   ins_pipe( pipe_slow );
12426 %}
12427 
12428 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
12429   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12430   match(Set dst (AddVD src (LoadVector mem)));
12431   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
12432   ins_encode %{
12433     int vector_len = 0;
12434     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12435   %}
12436   ins_pipe( pipe_slow );
12437 %}
12438 
12439 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
12440   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12441   match(Set dst (AddVD src1 src2));
12442   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
12443   ins_encode %{
12444     int vector_len = 1;
12445     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12446   %}
12447   ins_pipe( pipe_slow );
12448 %}
12449 
12450 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
12451   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12452   match(Set dst (AddVD src (LoadVector mem)));
12453   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
12454   ins_encode %{
12455     int vector_len = 1;
12456     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12457   %}
12458   ins_pipe( pipe_slow );
12459 %}
12460 
12461 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
12462   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12463   match(Set dst (AddVD src1 src2));
12464   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
12465   ins_encode %{
12466     int vector_len = 2;
12467     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12468   %}
12469   ins_pipe( pipe_slow );
12470 %}
12471 
12472 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
12473   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12474   match(Set dst (AddVD src (LoadVector mem)));
12475   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
12476   ins_encode %{
12477     int vector_len = 2;
12478     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12479   %}
12480   ins_pipe( pipe_slow );
12481 %}
12482 
12483 // --------------------------------- SUB --------------------------------------
12484 
12485 // Bytes vector sub
12486 instruct vsub4B(vecS dst, vecS src) %{
12487   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12488   match(Set dst (SubVB dst src));
12489   format %{ "psubb   $dst,$src\t! sub packed4B" %}
12490   ins_encode %{
12491     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12492   %}
12493   ins_pipe( pipe_slow );
12494 %}
12495 
12496 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
12497   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12498   match(Set dst (SubVB src1 src2));
12499   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
12500   ins_encode %{
12501     int vector_len = 0;
12502     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12503   %}
12504   ins_pipe( pipe_slow );
12505 %}
12506 
12507 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
12508   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12509   match(Set dst (SubVB src (LoadVector mem)));
12510   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
12511   ins_encode %{
12512     int vector_len = 0;
12513     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12514   %}
12515   ins_pipe( pipe_slow );
12516 %}
12517 
12518 instruct vsub8B(vecD dst, vecD src) %{
12519   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12520   match(Set dst (SubVB dst src));
12521   format %{ "psubb   $dst,$src\t! sub packed8B" %}
12522   ins_encode %{
12523     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12524   %}
12525   ins_pipe( pipe_slow );
12526 %}
12527 
12528 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
12529   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12530   match(Set dst (SubVB src1 src2));
12531   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
12532   ins_encode %{
12533     int vector_len = 0;
12534     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12535   %}
12536   ins_pipe( pipe_slow );
12537 %}
12538 
12539 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
12540   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12541   match(Set dst (SubVB src (LoadVector mem)));
12542   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
12543   ins_encode %{
12544     int vector_len = 0;
12545     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12546   %}
12547   ins_pipe( pipe_slow );
12548 %}
12549 
12550 instruct vsub16B(vecX dst, vecX src) %{
12551   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
12552   match(Set dst (SubVB dst src));
12553   format %{ "psubb   $dst,$src\t! sub packed16B" %}
12554   ins_encode %{
12555     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
12556   %}
12557   ins_pipe( pipe_slow );
12558 %}
12559 
12560 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
12561   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
12562   match(Set dst (SubVB src1 src2));
12563   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
12564   ins_encode %{
12565     int vector_len = 0;
12566     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12567   %}
12568   ins_pipe( pipe_slow );
12569 %}
12570 
12571 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
12572   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
12573   match(Set dst (SubVB src (LoadVector mem)));
12574   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
12575   ins_encode %{
12576     int vector_len = 0;
12577     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12578   %}
12579   ins_pipe( pipe_slow );
12580 %}
12581 
12582 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
12583   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
12584   match(Set dst (SubVB src1 src2));
12585   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
12586   ins_encode %{
12587     int vector_len = 1;
12588     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12589   %}
12590   ins_pipe( pipe_slow );
12591 %}
12592 
12593 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
12594   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
12595   match(Set dst (SubVB src (LoadVector mem)));
12596   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
12597   ins_encode %{
12598     int vector_len = 1;
12599     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12600   %}
12601   ins_pipe( pipe_slow );
12602 %}
12603 
12604 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
12605   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
12606   match(Set dst (SubVB src1 src2));
12607   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
12608   ins_encode %{
12609     int vector_len = 2;
12610     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12611   %}
12612   ins_pipe( pipe_slow );
12613 %}
12614 
12615 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
12616   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
12617   match(Set dst (SubVB src (LoadVector mem)));
12618   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
12619   ins_encode %{
12620     int vector_len = 2;
12621     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12622   %}
12623   ins_pipe( pipe_slow );
12624 %}
12625 
12626 // Shorts/Chars vector sub
12627 instruct vsub2S(vecS dst, vecS src) %{
12628   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12629   match(Set dst (SubVS dst src));
12630   format %{ "psubw   $dst,$src\t! sub packed2S" %}
12631   ins_encode %{
12632     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12633   %}
12634   ins_pipe( pipe_slow );
12635 %}
12636 
12637 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
12638   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12639   match(Set dst (SubVS src1 src2));
12640   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
12641   ins_encode %{
12642     int vector_len = 0;
12643     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12644   %}
12645   ins_pipe( pipe_slow );
12646 %}
12647 
12648 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
12649   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12650   match(Set dst (SubVS src (LoadVector mem)));
12651   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
12652   ins_encode %{
12653     int vector_len = 0;
12654     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12655   %}
12656   ins_pipe( pipe_slow );
12657 %}
12658 
12659 instruct vsub4S(vecD dst, vecD src) %{
12660   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12661   match(Set dst (SubVS dst src));
12662   format %{ "psubw   $dst,$src\t! sub packed4S" %}
12663   ins_encode %{
12664     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12665   %}
12666   ins_pipe( pipe_slow );
12667 %}
12668 
12669 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
12670   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12671   match(Set dst (SubVS src1 src2));
12672   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
12673   ins_encode %{
12674     int vector_len = 0;
12675     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12676   %}
12677   ins_pipe( pipe_slow );
12678 %}
12679 
12680 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
12681   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12682   match(Set dst (SubVS src (LoadVector mem)));
12683   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
12684   ins_encode %{
12685     int vector_len = 0;
12686     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12687   %}
12688   ins_pipe( pipe_slow );
12689 %}
12690 
12691 instruct vsub8S(vecX dst, vecX src) %{
12692   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
12693   match(Set dst (SubVS dst src));
12694   format %{ "psubw   $dst,$src\t! sub packed8S" %}
12695   ins_encode %{
12696     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
12697   %}
12698   ins_pipe( pipe_slow );
12699 %}
12700 
12701 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
12702   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12703   match(Set dst (SubVS src1 src2));
12704   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
12705   ins_encode %{
12706     int vector_len = 0;
12707     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12708   %}
12709   ins_pipe( pipe_slow );
12710 %}
12711 
12712 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
12713   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
12714   match(Set dst (SubVS src (LoadVector mem)));
12715   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
12716   ins_encode %{
12717     int vector_len = 0;
12718     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12719   %}
12720   ins_pipe( pipe_slow );
12721 %}
12722 
12723 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
12724   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12725   match(Set dst (SubVS src1 src2));
12726   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
12727   ins_encode %{
12728     int vector_len = 1;
12729     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12730   %}
12731   ins_pipe( pipe_slow );
12732 %}
12733 
12734 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
12735   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
12736   match(Set dst (SubVS src (LoadVector mem)));
12737   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
12738   ins_encode %{
12739     int vector_len = 1;
12740     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12741   %}
12742   ins_pipe( pipe_slow );
12743 %}
12744 
12745 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
12746   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12747   match(Set dst (SubVS src1 src2));
12748   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
12749   ins_encode %{
12750     int vector_len = 2;
12751     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12752   %}
12753   ins_pipe( pipe_slow );
12754 %}
12755 
12756 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
12757   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
12758   match(Set dst (SubVS src (LoadVector mem)));
12759   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
12760   ins_encode %{
12761     int vector_len = 2;
12762     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12763   %}
12764   ins_pipe( pipe_slow );
12765 %}
12766 
12767 // Integers vector sub
12768 instruct vsub2I(vecD dst, vecD src) %{
12769   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12770   match(Set dst (SubVI dst src));
12771   format %{ "psubd   $dst,$src\t! sub packed2I" %}
12772   ins_encode %{
12773     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
12774   %}
12775   ins_pipe( pipe_slow );
12776 %}
12777 
12778 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
12779   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12780   match(Set dst (SubVI src1 src2));
12781   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
12782   ins_encode %{
12783     int vector_len = 0;
12784     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12785   %}
12786   ins_pipe( pipe_slow );
12787 %}
12788 
12789 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
12790   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12791   match(Set dst (SubVI src (LoadVector mem)));
12792   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
12793   ins_encode %{
12794     int vector_len = 0;
12795     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12796   %}
12797   ins_pipe( pipe_slow );
12798 %}
12799 
12800 instruct vsub4I(vecX dst, vecX src) %{
12801   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12802   match(Set dst (SubVI dst src));
12803   format %{ "psubd   $dst,$src\t! sub packed4I" %}
12804   ins_encode %{
12805     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
12806   %}
12807   ins_pipe( pipe_slow );
12808 %}
12809 
12810 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
12811   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12812   match(Set dst (SubVI src1 src2));
12813   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
12814   ins_encode %{
12815     int vector_len = 0;
12816     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12817   %}
12818   ins_pipe( pipe_slow );
12819 %}
12820 
12821 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
12822   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12823   match(Set dst (SubVI src (LoadVector mem)));
12824   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
12825   ins_encode %{
12826     int vector_len = 0;
12827     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12828   %}
12829   ins_pipe( pipe_slow );
12830 %}
12831 
12832 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
12833   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12834   match(Set dst (SubVI src1 src2));
12835   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
12836   ins_encode %{
12837     int vector_len = 1;
12838     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12839   %}
12840   ins_pipe( pipe_slow );
12841 %}
12842 
12843 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
12844   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
12845   match(Set dst (SubVI src (LoadVector mem)));
12846   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
12847   ins_encode %{
12848     int vector_len = 1;
12849     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12850   %}
12851   ins_pipe( pipe_slow );
12852 %}
12853 
12854 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
12855   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12856   match(Set dst (SubVI src1 src2));
12857   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
12858   ins_encode %{
12859     int vector_len = 2;
12860     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12861   %}
12862   ins_pipe( pipe_slow );
12863 %}
12864 
12865 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
12866   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
12867   match(Set dst (SubVI src (LoadVector mem)));
12868   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
12869   ins_encode %{
12870     int vector_len = 2;
12871     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12872   %}
12873   ins_pipe( pipe_slow );
12874 %}
12875 
12876 // Longs vector sub
12877 instruct vsub2L(vecX dst, vecX src) %{
12878   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12879   match(Set dst (SubVL dst src));
12880   format %{ "psubq   $dst,$src\t! sub packed2L" %}
12881   ins_encode %{
12882     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
12883   %}
12884   ins_pipe( pipe_slow );
12885 %}
12886 
12887 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
12888   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12889   match(Set dst (SubVL src1 src2));
12890   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
12891   ins_encode %{
12892     int vector_len = 0;
12893     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12894   %}
12895   ins_pipe( pipe_slow );
12896 %}
12897 
12898 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
12899   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12900   match(Set dst (SubVL src (LoadVector mem)));
12901   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
12902   ins_encode %{
12903     int vector_len = 0;
12904     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12905   %}
12906   ins_pipe( pipe_slow );
12907 %}
12908 
12909 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
12910   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12911   match(Set dst (SubVL src1 src2));
12912   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
12913   ins_encode %{
12914     int vector_len = 1;
12915     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12916   %}
12917   ins_pipe( pipe_slow );
12918 %}
12919 
12920 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
12921   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
12922   match(Set dst (SubVL src (LoadVector mem)));
12923   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
12924   ins_encode %{
12925     int vector_len = 1;
12926     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12927   %}
12928   ins_pipe( pipe_slow );
12929 %}
12930 
12931 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
12932   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12933   match(Set dst (SubVL src1 src2));
12934   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
12935   ins_encode %{
12936     int vector_len = 2;
12937     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12938   %}
12939   ins_pipe( pipe_slow );
12940 %}
12941 
12942 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
12943   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
12944   match(Set dst (SubVL src (LoadVector mem)));
12945   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
12946   ins_encode %{
12947     int vector_len = 2;
12948     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12949   %}
12950   ins_pipe( pipe_slow );
12951 %}
12952 
12953 // Floats vector sub
12954 instruct vsub2F(vecD dst, vecD src) %{
12955   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
12956   match(Set dst (SubVF dst src));
12957   format %{ "subps   $dst,$src\t! sub packed2F" %}
12958   ins_encode %{
12959     __ subps($dst$$XMMRegister, $src$$XMMRegister);
12960   %}
12961   ins_pipe( pipe_slow );
12962 %}
12963 
12964 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
12965   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12966   match(Set dst (SubVF src1 src2));
12967   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
12968   ins_encode %{
12969     int vector_len = 0;
12970     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
12971   %}
12972   ins_pipe( pipe_slow );
12973 %}
12974 
12975 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
12976   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
12977   match(Set dst (SubVF src (LoadVector mem)));
12978   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
12979   ins_encode %{
12980     int vector_len = 0;
12981     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
12982   %}
12983   ins_pipe( pipe_slow );
12984 %}
12985 
12986 instruct vsub4F(vecX dst, vecX src) %{
12987   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
12988   match(Set dst (SubVF dst src));
12989   format %{ "subps   $dst,$src\t! sub packed4F" %}
12990   ins_encode %{
12991     __ subps($dst$$XMMRegister, $src$$XMMRegister);
12992   %}
12993   ins_pipe( pipe_slow );
12994 %}
12995 
12996 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
12997   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
12998   match(Set dst (SubVF src1 src2));
12999   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
13000   ins_encode %{
13001     int vector_len = 0;
13002     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13003   %}
13004   ins_pipe( pipe_slow );
13005 %}
13006 
13007 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
13008   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13009   match(Set dst (SubVF src (LoadVector mem)));
13010   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
13011   ins_encode %{
13012     int vector_len = 0;
13013     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13014   %}
13015   ins_pipe( pipe_slow );
13016 %}
13017 
13018 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
13019   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13020   match(Set dst (SubVF src1 src2));
13021   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
13022   ins_encode %{
13023     int vector_len = 1;
13024     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13025   %}
13026   ins_pipe( pipe_slow );
13027 %}
13028 
13029 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
13030   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13031   match(Set dst (SubVF src (LoadVector mem)));
13032   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
13033   ins_encode %{
13034     int vector_len = 1;
13035     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13036   %}
13037   ins_pipe( pipe_slow );
13038 %}
13039 
13040 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
13041   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13042   match(Set dst (SubVF src1 src2));
13043   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
13044   ins_encode %{
13045     int vector_len = 2;
13046     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13047   %}
13048   ins_pipe( pipe_slow );
13049 %}
13050 
13051 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
13052   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13053   match(Set dst (SubVF src (LoadVector mem)));
13054   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
13055   ins_encode %{
13056     int vector_len = 2;
13057     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13058   %}
13059   ins_pipe( pipe_slow );
13060 %}
13061 
13062 // Doubles vector sub
13063 instruct vsub2D(vecX dst, vecX src) %{
13064   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13065   match(Set dst (SubVD dst src));
13066   format %{ "subpd   $dst,$src\t! sub packed2D" %}
13067   ins_encode %{
13068     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
13069   %}
13070   ins_pipe( pipe_slow );
13071 %}
13072 
13073 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
13074   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13075   match(Set dst (SubVD src1 src2));
13076   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
13077   ins_encode %{
13078     int vector_len = 0;
13079     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13080   %}
13081   ins_pipe( pipe_slow );
13082 %}
13083 
13084 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
13085   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13086   match(Set dst (SubVD src (LoadVector mem)));
13087   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
13088   ins_encode %{
13089     int vector_len = 0;
13090     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13091   %}
13092   ins_pipe( pipe_slow );
13093 %}
13094 
13095 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
13096   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13097   match(Set dst (SubVD src1 src2));
13098   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
13099   ins_encode %{
13100     int vector_len = 1;
13101     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13102   %}
13103   ins_pipe( pipe_slow );
13104 %}
13105 
13106 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
13107   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13108   match(Set dst (SubVD src (LoadVector mem)));
13109   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
13110   ins_encode %{
13111     int vector_len = 1;
13112     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13113   %}
13114   ins_pipe( pipe_slow );
13115 %}
13116 
13117 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
13118   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13119   match(Set dst (SubVD src1 src2));
13120   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
13121   ins_encode %{
13122     int vector_len = 2;
13123     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13124   %}
13125   ins_pipe( pipe_slow );
13126 %}
13127 
13128 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
13129   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13130   match(Set dst (SubVD src (LoadVector mem)));
13131   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
13132   ins_encode %{
13133     int vector_len = 2;
13134     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13135   %}
13136   ins_pipe( pipe_slow );
13137 %}
13138 
13139 // --------------------------------- MUL --------------------------------------
13140 
13141 // Byte vector mul
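// x86 has no packed byte multiply, so the rules below sign-extend each
// byte lane to a 16-bit word (pmovsxbw), multiply with pmullw, mask the
// products back to their low bytes with the 0x00ff... constant loaded
// from vector_short_to_byte_mask(), and re-pack with packuswb.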
13142 
13143 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp2, vecS tmp) %{
13144   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
13145   match(Set dst (MulVB src1 src2));
13146   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13147   format %{"pmovsxbw  $tmp,$src1\n\t"
13148            "pmovsxbw  $tmp2,$src2\n\t"
13149            "pmullw    $tmp,$tmp2\n\t"
13150            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13151            "pand      $tmp,$tmp2\n\t"
13152            "packuswb  $tmp,$tmp\n\t"
13153            "movss     $dst,$tmp\t! mul packed4B" %}
13154   ins_encode %{
13155     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13156     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13157     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13158     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13159     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
13160     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
13161     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
13162   %}
13163   ins_pipe( pipe_slow );
13164 %}
13165 
13166 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp2, vecD tmp) %{
13167   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
13168   match(Set dst (MulVB src1 src2));
13169   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13170   format %{"pmovsxbw  $tmp,$src1\n\t"
13171            "pmovsxbw  $tmp2,$src2\n\t"
13172            "pmullw    $tmp,$tmp2\n\t"
13173            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13174            "pand      $tmp,$tmp2\n\t"
13175            "packuswb  $tmp,$tmp\n\t"
13176            "movsd     $dst,$tmp\t! mul packed8B" %}
13177   ins_encode %{
13178     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13179     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13180     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
13181     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13182     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
13183     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
13184     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
13185   %}
13186   ins_pipe( pipe_slow );
13187 %}
13188 
13189 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp3, vecX tmp2, vecX tmp) %{
13190   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
13191   match(Set dst (MulVB src1 src2));
13192   effect(TEMP tmp3, TEMP tmp2, TEMP tmp);
13193   format %{"pmovsxbw  $tmp,$src1\n\t"
13194            "pmovsxbw  $tmp2,$src2\n\t"
13195            "pmullw    $tmp,$tmp2\n\t"
13196            "pshufd    $tmp2,$src1\n\t"
13197            "pshufd    $tmp3,$src2\n\t"
13198            "pmovsxbw  $tmp2,$tmp2\n\t"
13199            "pmovsxbw  $tmp3,$tmp3\n\t"
13200            "pmullw    $tmp2,$tmp3\n\t"
13201            "movdqu    $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
13202            "pand      $tmp,$tmp3\n\t"
13203            "pand      $tmp2,$tmp3\n\t"
13204            "packuswb  $tmp,$tmp2\n\t"
13205            "movdqu    $dst,$tmp \n\t! mul packed16B" %}
13206   ins_encode %{
13207     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
13208     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
13209     __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
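    // pshufd 238 (0xEE) replicates the upper 64 bits into the low half
    // so the high eight bytes can be widened and multiplied separately.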
13210     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 238);
13211     __ pshufd($tmp3$$XMMRegister, $src2$$XMMRegister, 238);
13212     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
13213     __ pmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister);
13214     __ pmullw($tmp2$$XMMRegister, $tmp3$$XMMRegister);
13215     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13216     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
13217     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
13218     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
13219     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
13220   %}
13221   ins_pipe( pipe_slow );
13222 %}
13223 
13224 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecY tmp2, vecY tmp) %{
13225   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13226   match(Set dst (MulVB src1 src2));
13227   effect(TEMP dst, TEMP tmp2, TEMP tmp);
13228   format %{"vpmovsxbw  $tmp,$src1\n\t"
13229            "vpmovsxbw  $tmp2,$src2\n\t"
13230            "vpmullw    $tmp,$tmp2\n\t"
13231            "vmovdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
13232            "vpand      $tmp,$tmp2\n\t"
13233            "vextracti128_high  $tmp2,$tmp\n\t"
13234            "vpackuswb  $dst,$tmp, $tmp2\n\t! mul packed16B" %}
13235   ins_encode %{
    int vector_len = 1;
13237     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
13238     __ vpmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister, vector_len);
13239     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
13240     __ vmovdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13241     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
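    // The 16 word-wide products fill a 256-bit register; fold the high
    // 128 bits back onto the low half so one 128-bit pack yields all 16 bytes.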
13242     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
13243     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
13244   %}
13245   ins_pipe( pipe_slow );
13246 %}
13247 
13248 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, vecY tmp3) %{
13249   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
13250   match(Set dst (MulVB src1 src2));
13251   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
13252   format %{"vextracti128_high  $tmp1,$src1\n\t"
13253            "vextracti128_high  $tmp3,$src2\n\t"
13254            "vpmovsxbw $tmp1,$tmp1\n\t"
13255            "vpmovsxbw $tmp3,$tmp3\n\t"
13256            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
13257            "vpmovsxbw $tmp2,$src1\n\t"
13258            "vpmovsxbw $tmp3,$src2\n\t"
13259            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
13260            "vmovdqu $tmp3, [0x00ff00ff0x00ff00ff]\n\t"
13261            "vpbroadcastd $tmp3, $tmp3\n\t"
13262            "vpand $tmp2,$tmp2,$tmp3\n\t"
13263            "vpand $tmp1,$tmp1,$tmp3\n\t"
13264            "vpackuswb $dst,$tmp2,$tmp1\n\t"
13265            "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
13266   ins_encode %{
13267     int vector_len = 1;
13268     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
13269     __ vextracti128_high($tmp3$$XMMRegister, $src2$$XMMRegister);
13270     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13271     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13272     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13273     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
13274     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
13275     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13276     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13277     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13278     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13279     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13280     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
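    // vpackuswb packs within 128-bit lanes; vpermq 0xD8 (qword order 0,2,1,3)
    // restores the bytes to element order.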
13281     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
13282   %}
13283   ins_pipe( pipe_slow );
13284 %}
13285 
instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, vecZ tmp3, vecZ tmp4) %{
13287   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
13288   match(Set dst (MulVB src1 src2));
13289   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4);
13290   format %{"vextracti64x4_high  $tmp1,$src1\n\t"
13291            "vextracti64x4_high  $tmp3,$src2\n\t"
13292            "vpmovsxbw $tmp1,$tmp1\n\t"
13293            "vpmovsxbw $tmp3,$tmp3\n\t"
13294            "vpmullw $tmp1,$tmp1,$tmp3\n\t"
13295            "vpmovsxbw $tmp2,$src1\n\t"
13296            "vpmovsxbw $tmp3,$src2\n\t"
13297            "vpmullw $tmp2,$tmp2,$tmp3\n\t"
13298            "vmovdqu $tmp3, [0x00ff00ff0x00ff00ff]\n\t"
13299            "vpbroadcastd $tmp3, $tmp3\n\t"
13300            "vpand $tmp1,$tmp1,$tmp3\n\t"
13301            "vpand $tmp2,$tmp2,$tmp3\n\t"
13302            "vpackuswb $tmp1,$tmp2,$tmp1\n\t"
13303            "vextracti64x4_high  $tmp3,$tmp1\n\t"
13304            "vpermq $tmp3, $tmp3, 0x8D\n\t"
13305            "vpermq $tmp1, $tmp1, 0xD8\n\t"
13306            "vmovdqu  $tmp4,$tmp3\n\t"
13307            "vmovdqu  $tmp2,$tmp1\n\t"
13308            "vpblendd  $tmp3,$tmp3,$tmp1\n\t"
13309            "vpblendd  $tmp2,$tmp2,$tmp4\n\t"
13310            "vpermq $tmp2,$tmp2,0x4E\n\t"
13311            "vinserti64x4 $dst,$dst,$tmp3,0x00\n\t"
13312            "vinserti64x4 $dst,$dst,$tmp2,0x01\t! mul packed64B" %}
13313   ins_encode %{
13314     int vector_len = 2;
13315     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
13316     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
13317     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13318     __ vpmovsxbw($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13319     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13320     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
13321     __ vpmovsxbw($tmp3$$XMMRegister, $src2$$XMMRegister, vector_len);
13322     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13323     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
13324     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13325     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13326     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
13327     __ vpackuswb($tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp1$$XMMRegister, vector_len);
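    // vpackuswb packed within 128-bit lanes, so the four 128-bit chunks are
    // out of element order; the vpermq/vpblendd/vinserti64x4 sequence below
    // puts them back.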
13328     __ vextracti64x4_high($tmp3$$XMMRegister, $tmp1$$XMMRegister);
13329     __ vpermq($tmp3$$XMMRegister, $tmp3$$XMMRegister, 0x8D, 1);
13330     __ vpermq($tmp1$$XMMRegister, $tmp1$$XMMRegister, 0xD8, 1);
13331     __ vmovdqu($tmp4$$XMMRegister, $tmp3$$XMMRegister);
13332     __ vmovdqu($tmp2$$XMMRegister, $tmp1$$XMMRegister);
13333     __ vpblendd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $tmp1$$XMMRegister, 0x0F, 1);
13334     __ vpblendd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp4$$XMMRegister, 0x0F, 1);
13335     __ vpermq($tmp2$$XMMRegister, $tmp2$$XMMRegister, 0x4E, 1);
13336     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp3$$XMMRegister, 0x00);
13337     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, 0x01);
13338   %}
13339   ins_pipe( pipe_slow );
13340 %}
13341 
13342 // Shorts/Chars vector mul
13343 instruct vmul2S(vecS dst, vecS src) %{
13344   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13345   match(Set dst (MulVS dst src));
13346   format %{ "pmullw $dst,$src\t! mul packed2S" %}
13347   ins_encode %{
13348     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13349   %}
13350   ins_pipe( pipe_slow );
13351 %}
13352 
13353 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
13354   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13355   match(Set dst (MulVS src1 src2));
13356   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
13357   ins_encode %{
13358     int vector_len = 0;
13359     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13360   %}
13361   ins_pipe( pipe_slow );
13362 %}
13363 
13364 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
13365   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13366   match(Set dst (MulVS src (LoadVector mem)));
13367   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
13368   ins_encode %{
13369     int vector_len = 0;
13370     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13371   %}
13372   ins_pipe( pipe_slow );
13373 %}
13374 
13375 instruct vmul4S(vecD dst, vecD src) %{
13376   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13377   match(Set dst (MulVS dst src));
13378   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
13379   ins_encode %{
13380     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13381   %}
13382   ins_pipe( pipe_slow );
13383 %}
13384 
13385 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
13386   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13387   match(Set dst (MulVS src1 src2));
13388   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
13389   ins_encode %{
13390     int vector_len = 0;
13391     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13392   %}
13393   ins_pipe( pipe_slow );
13394 %}
13395 
13396 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
13397   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13398   match(Set dst (MulVS src (LoadVector mem)));
13399   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
13400   ins_encode %{
13401     int vector_len = 0;
13402     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13403   %}
13404   ins_pipe( pipe_slow );
13405 %}
13406 
13407 instruct vmul8S(vecX dst, vecX src) %{
13408   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
13409   match(Set dst (MulVS dst src));
13410   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
13411   ins_encode %{
13412     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
13413   %}
13414   ins_pipe( pipe_slow );
13415 %}
13416 
13417 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
13418   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13419   match(Set dst (MulVS src1 src2));
13420   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
13421   ins_encode %{
13422     int vector_len = 0;
13423     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13424   %}
13425   ins_pipe( pipe_slow );
13426 %}
13427 
13428 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
13429   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13430   match(Set dst (MulVS src (LoadVector mem)));
13431   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
13432   ins_encode %{
13433     int vector_len = 0;
13434     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13435   %}
13436   ins_pipe( pipe_slow );
13437 %}
13438 
13439 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
13440   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13441   match(Set dst (MulVS src1 src2));
13442   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
13443   ins_encode %{
13444     int vector_len = 1;
13445     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13446   %}
13447   ins_pipe( pipe_slow );
13448 %}
13449 
13450 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
13451   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
13452   match(Set dst (MulVS src (LoadVector mem)));
13453   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
13454   ins_encode %{
13455     int vector_len = 1;
13456     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13457   %}
13458   ins_pipe( pipe_slow );
13459 %}
13460 
13461 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
13462   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
13463   match(Set dst (MulVS src1 src2));
13464   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
13465   ins_encode %{
13466     int vector_len = 2;
13467     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13468   %}
13469   ins_pipe( pipe_slow );
13470 %}
13471 
13472 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
13473   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
13474   match(Set dst (MulVS src (LoadVector mem)));
13475   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
13476   ins_encode %{
13477     int vector_len = 2;
13478     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13479   %}
13480   ins_pipe( pipe_slow );
13481 %}
13482 
13483 // Integers vector mul (sse4_1)
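// pmulld (packed 32-bit multiply, keeping the low 32 bits of each product)
// is an SSE4.1 instruction, hence the UseSSE > 3 predicate on the non-AVX rules.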
13484 instruct vmul2I(vecD dst, vecD src) %{
13485   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
13486   match(Set dst (MulVI dst src));
13487   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
13488   ins_encode %{
13489     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
13490   %}
13491   ins_pipe( pipe_slow );
13492 %}
13493 
13494 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
13495   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13496   match(Set dst (MulVI src1 src2));
13497   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
13498   ins_encode %{
13499     int vector_len = 0;
13500     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13501   %}
13502   ins_pipe( pipe_slow );
13503 %}
13504 
13505 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
13506   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13507   match(Set dst (MulVI src (LoadVector mem)));
13508   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
13509   ins_encode %{
13510     int vector_len = 0;
13511     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13512   %}
13513   ins_pipe( pipe_slow );
13514 %}
13515 
13516 instruct vmul4I(vecX dst, vecX src) %{
13517   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
13518   match(Set dst (MulVI dst src));
13519   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
13520   ins_encode %{
13521     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
13522   %}
13523   ins_pipe( pipe_slow );
13524 %}
13525 
13526 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
13527   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13528   match(Set dst (MulVI src1 src2));
13529   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
13530   ins_encode %{
13531     int vector_len = 0;
13532     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13533   %}
13534   ins_pipe( pipe_slow );
13535 %}
13536 
13537 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
13538   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13539   match(Set dst (MulVI src (LoadVector mem)));
13540   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
13541   ins_encode %{
13542     int vector_len = 0;
13543     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13544   %}
13545   ins_pipe( pipe_slow );
13546 %}
13547 
13548 // Long vector mul
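// There is no packed 64-bit multiply before AVX-512DQ (vpmullq), so the
// older rules assemble it from 32-bit pieces.  With each lane written as
// a = a_hi*2^32 + a_lo, the low 64 bits of a*b are
//   a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32)
// pshufd 177 (0xB1) swaps adjacent dwords so pmulld forms the cross
// products, phaddd sums each pair, psllq shifts the sums into the high
// dword, and pmuludq contributes the unsigned a_lo*b_lo term.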
13549 
13550 instruct mul2L_reg(vecX dst, vecX src2, vecX tmp) %{
13551   predicate(UseSSE > 3 && n->as_Vector()->length() == 2 && VM_Version::supports_sse4_1());
13552   match(Set dst (MulVL dst src2));
13553   effect(TEMP dst, TEMP tmp);
13554   format %{ "pshufd $tmp,$src2, 177\n\t"
13555             "pmulld $tmp,$dst\n\t"
13556             "phaddd $tmp,$tmp\n\t"
13557             "pmovzxdq $tmp,$tmp\n\t"
13558             "psllq $tmp, 32\n\t"
13559             "pmuludq $dst,$src2\n\t"
13560             "paddq $dst,$tmp\n\t! mul packed2L" %}
13561 
  ins_encode %{
13564     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
13565     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
13566     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
13567     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
13568     __ psllq($tmp$$XMMRegister, 32);
13569     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
13570     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
13571   %}
13572   ins_pipe( pipe_slow );
13573 %}
13574 
13575 instruct vmul2L_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp) %{
13576   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && VM_Version::supports_avx());
13577   match(Set dst (MulVL src1 src2));
13578   effect(TEMP tmp1, TEMP tmp);
13579   format %{ "vpshufd $tmp,$src2\n\t"
13580             "vpmulld $tmp,$src1,$tmp\n\t"
13581             "vphaddd $tmp,$tmp,$tmp\n\t"
13582             "vpmovzxdq $tmp,$tmp\n\t"
13583             "vpsllq $tmp,$tmp\n\t"
13584             "vpmuludq $tmp1,$src1,$src2\n\t"
13585             "vpaddq $dst,$tmp,$tmp1\t! mul packed2L" %}
13586   ins_encode %{
13587     int vector_len = 0;
13588     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
13589     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
13590     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13591     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13592     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
13593     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13594     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13595   %}
13596   ins_pipe( pipe_slow );
13597 %}
13598 
13599 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
13600   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
13601   match(Set dst (MulVL src1 src2));
13602   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
13603   ins_encode %{
13604     int vector_len = 0;
13605     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13606   %}
13607   ins_pipe( pipe_slow );
13608 %}
13609 
13610 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
13611   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
13612   match(Set dst (MulVL src (LoadVector mem)));
13613   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
13614   ins_encode %{
13615     int vector_len = 0;
13616     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13617   %}
13618   ins_pipe( pipe_slow );
13619 %}
13620 
instruct vmul4L_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp, vecY tmp1) %{
13622   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && VM_Version::supports_avx2());
13623   match(Set dst (MulVL src1 src2));
13624   effect(TEMP tmp1, TEMP tmp);
13625   format %{ "vpshufd $tmp,$src2\n\t"
13626             "vpmulld $tmp,$src1,$tmp\n\t"
13627             "vphaddd $tmp,$tmp,$tmp\n\t"
13628             "vpmovzxdq $tmp,$tmp\n\t"
13629             "vpsllq $tmp,$tmp\n\t"
13630             "vpmuludq $tmp1,$src1,$src2\n\t"
13631             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
13632   ins_encode %{
13633     int vector_len = 1;
13634     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vector_len);
13635     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vector_len);
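    // vphaddd adds within 128-bit lanes only, so fold the upper half in
    // explicitly to gather all four cross-term sums into the low 128 bits.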
13636     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
13637     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13638     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
13639     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vector_len);
13640     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13641     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vector_len);
13642   %}
13643   ins_pipe( pipe_slow );
13644 %}
13645 
13646 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
13647   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
13648   match(Set dst (MulVL src1 src2));
13649   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
13650   ins_encode %{
13651     int vector_len = 1;
13652     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13653   %}
13654   ins_pipe( pipe_slow );
13655 %}
13656 
13657 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
13658   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
13659   match(Set dst (MulVL src (LoadVector mem)));
13660   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
13661   ins_encode %{
13662     int vector_len = 1;
13663     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13664   %}
13665   ins_pipe( pipe_slow );
13666 %}
13667 
13668 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
13669   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
13670   match(Set dst (MulVL src1 src2));
13671   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
13672   ins_encode %{
13673     int vector_len = 2;
13674     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13675   %}
13676   ins_pipe( pipe_slow );
13677 %}
13678 
13679 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
13680   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
13681   match(Set dst (MulVL src (LoadVector mem)));
13682   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
13683   ins_encode %{
13684     int vector_len = 2;
13685     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13686   %}
13687   ins_pipe( pipe_slow );
13688 %}
13689 
13690 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
13691   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
13692   match(Set dst (MulVI src1 src2));
13693   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
13694   ins_encode %{
13695     int vector_len = 1;
13696     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13697   %}
13698   ins_pipe( pipe_slow );
13699 %}
13700 
13701 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
13702   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
13703   match(Set dst (MulVI src (LoadVector mem)));
13704   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
13705   ins_encode %{
13706     int vector_len = 1;
13707     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13708   %}
13709   ins_pipe( pipe_slow );
13710 %}
13711 
13712 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
13713   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13714   match(Set dst (MulVI src1 src2));
13715   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
13716   ins_encode %{
13717     int vector_len = 2;
13718     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13719   %}
13720   ins_pipe( pipe_slow );
13721 %}
13722 
13723 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
13724   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13725   match(Set dst (MulVI src (LoadVector mem)));
13726   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
13727   ins_encode %{
13728     int vector_len = 2;
13729     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13730   %}
13731   ins_pipe( pipe_slow );
13732 %}
13733 
13734 // Floats vector mul
13735 instruct vmul2F(vecD dst, vecD src) %{
13736   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13737   match(Set dst (MulVF dst src));
13738   format %{ "mulps   $dst,$src\t! mul packed2F" %}
13739   ins_encode %{
13740     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
13741   %}
13742   ins_pipe( pipe_slow );
13743 %}
13744 
13745 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
13746   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13747   match(Set dst (MulVF src1 src2));
13748   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
13749   ins_encode %{
13750     int vector_len = 0;
13751     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13752   %}
13753   ins_pipe( pipe_slow );
13754 %}
13755 
13756 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
13757   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13758   match(Set dst (MulVF src (LoadVector mem)));
13759   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
13760   ins_encode %{
13761     int vector_len = 0;
13762     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13763   %}
13764   ins_pipe( pipe_slow );
13765 %}
13766 
13767 instruct vmul4F(vecX dst, vecX src) %{
13768   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13769   match(Set dst (MulVF dst src));
13770   format %{ "mulps   $dst,$src\t! mul packed4F" %}
13771   ins_encode %{
13772     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
13773   %}
13774   ins_pipe( pipe_slow );
13775 %}
13776 
13777 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
13778   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13779   match(Set dst (MulVF src1 src2));
13780   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
13781   ins_encode %{
13782     int vector_len = 0;
13783     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13784   %}
13785   ins_pipe( pipe_slow );
13786 %}
13787 
13788 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
13789   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13790   match(Set dst (MulVF src (LoadVector mem)));
13791   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
13792   ins_encode %{
13793     int vector_len = 0;
13794     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13795   %}
13796   ins_pipe( pipe_slow );
13797 %}
13798 
13799 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
13800   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13801   match(Set dst (MulVF src1 src2));
13802   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
13803   ins_encode %{
13804     int vector_len = 1;
13805     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13806   %}
13807   ins_pipe( pipe_slow );
13808 %}
13809 
13810 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
13811   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13812   match(Set dst (MulVF src (LoadVector mem)));
13813   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
13814   ins_encode %{
13815     int vector_len = 1;
13816     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13817   %}
13818   ins_pipe( pipe_slow );
13819 %}
13820 
13821 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
13822   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13823   match(Set dst (MulVF src1 src2));
13824   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
13825   ins_encode %{
13826     int vector_len = 2;
13827     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13828   %}
13829   ins_pipe( pipe_slow );
13830 %}
13831 
13832 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
13833   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
13834   match(Set dst (MulVF src (LoadVector mem)));
13835   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
13836   ins_encode %{
13837     int vector_len = 2;
13838     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13839   %}
13840   ins_pipe( pipe_slow );
13841 %}
13842 
13843 // Doubles vector mul
13844 instruct vmul2D(vecX dst, vecX src) %{
13845   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13846   match(Set dst (MulVD dst src));
13847   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
13848   ins_encode %{
13849     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
13850   %}
13851   ins_pipe( pipe_slow );
13852 %}
13853 
13854 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
13855   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13856   match(Set dst (MulVD src1 src2));
13857   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
13858   ins_encode %{
13859     int vector_len = 0;
13860     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13861   %}
13862   ins_pipe( pipe_slow );
13863 %}
13864 
13865 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
13866   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13867   match(Set dst (MulVD src (LoadVector mem)));
13868   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
13869   ins_encode %{
13870     int vector_len = 0;
13871     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13872   %}
13873   ins_pipe( pipe_slow );
13874 %}
13875 
13876 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
13877   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13878   match(Set dst (MulVD src1 src2));
13879   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
13880   ins_encode %{
13881     int vector_len = 1;
13882     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13883   %}
13884   ins_pipe( pipe_slow );
13885 %}
13886 
13887 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
13888   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13889   match(Set dst (MulVD src (LoadVector mem)));
13890   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
13891   ins_encode %{
13892     int vector_len = 1;
13893     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13894   %}
13895   ins_pipe( pipe_slow );
13896 %}
13897 
13898 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
13899   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13900   match(Set dst (MulVD src1 src2));
13901   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
13902   ins_encode %{
13903     int vector_len = 2;
13904     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13905   %}
13906   ins_pipe( pipe_slow );
13907 %}
13908 
13909 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
13910   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
13911   match(Set dst (MulVD src (LoadVector mem)));
13912   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
13913   ins_encode %{
13914     int vector_len = 2;
13915     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13916   %}
13917   ins_pipe( pipe_slow );
13918 %}
13919 
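// Vector conditional move: the packed compare leaves an all-ones/all-zeros
// mask in each lane of $dst, and the variable blend then uses that mask to
// select between the $src1 and $src2 lanes.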
13920 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
13921   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
13922   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
13923   effect(TEMP dst, USE src1, USE src2);
13924   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
13925             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
13926          %}
13927   ins_encode %{
13928     int vector_len = 1;
13929     int cond = (Assembler::Condition)($copnd$$cmpcode);
13930     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
13931     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
13932   %}
13933   ins_pipe( pipe_slow );
13934 %}
13935 
13936 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
13937   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13938   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
13939   effect(TEMP dst, USE src1, USE src2);
13940   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
13941             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
13942          %}
13943   ins_encode %{
13944     int vector_len = 1;
13945     int cond = (Assembler::Condition)($copnd$$cmpcode);
13946     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
13947     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
13948   %}
13949   ins_pipe( pipe_slow );
13950 %}
13951 
13952 // --------------------------------- DIV --------------------------------------
13953 
13954 // Floats vector div
13955 instruct vdiv2F(vecD dst, vecD src) %{
13956   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
13957   match(Set dst (DivVF dst src));
13958   format %{ "divps   $dst,$src\t! div packed2F" %}
13959   ins_encode %{
13960     __ divps($dst$$XMMRegister, $src$$XMMRegister);
13961   %}
13962   ins_pipe( pipe_slow );
13963 %}
13964 
13965 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
13966   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13967   match(Set dst (DivVF src1 src2));
13968   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
13969   ins_encode %{
13970     int vector_len = 0;
13971     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
13972   %}
13973   ins_pipe( pipe_slow );
13974 %}
13975 
13976 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
13977   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
13978   match(Set dst (DivVF src (LoadVector mem)));
13979   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
13980   ins_encode %{
13981     int vector_len = 0;
13982     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
13983   %}
13984   ins_pipe( pipe_slow );
13985 %}
13986 
13987 instruct vdiv4F(vecX dst, vecX src) %{
13988   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
13989   match(Set dst (DivVF dst src));
13990   format %{ "divps   $dst,$src\t! div packed4F" %}
13991   ins_encode %{
13992     __ divps($dst$$XMMRegister, $src$$XMMRegister);
13993   %}
13994   ins_pipe( pipe_slow );
13995 %}
13996 
13997 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
13998   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
13999   match(Set dst (DivVF src1 src2));
14000   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
14001   ins_encode %{
14002     int vector_len = 0;
14003     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14004   %}
14005   ins_pipe( pipe_slow );
14006 %}
14007 
14008 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
14009   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14010   match(Set dst (DivVF src (LoadVector mem)));
14011   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
14012   ins_encode %{
14013     int vector_len = 0;
14014     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14015   %}
14016   ins_pipe( pipe_slow );
14017 %}
14018 
14019 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
14020   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14021   match(Set dst (DivVF src1 src2));
14022   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
14023   ins_encode %{
14024     int vector_len = 1;
14025     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14026   %}
14027   ins_pipe( pipe_slow );
14028 %}
14029 
14030 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
14031   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
14032   match(Set dst (DivVF src (LoadVector mem)));
14033   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
14034   ins_encode %{
14035     int vector_len = 1;
14036     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14037   %}
14038   ins_pipe( pipe_slow );
14039 %}
14040 
14041 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14043   match(Set dst (DivVF src1 src2));
14044   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
14045   ins_encode %{
14046     int vector_len = 2;
14047     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14048   %}
14049   ins_pipe( pipe_slow );
14050 %}
14051 
14052 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
14054   match(Set dst (DivVF src (LoadVector mem)));
14055   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
14056   ins_encode %{
14057     int vector_len = 2;
14058     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14059   %}
14060   ins_pipe( pipe_slow );
14061 %}
14062 
14063 // Doubles vector div
14064 instruct vdiv2D(vecX dst, vecX src) %{
14065   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
14066   match(Set dst (DivVD dst src));
14067   format %{ "divpd   $dst,$src\t! div packed2D" %}
14068   ins_encode %{
14069     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
14070   %}
14071   ins_pipe( pipe_slow );
14072 %}
14073 
14074 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
14075   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14076   match(Set dst (DivVD src1 src2));
14077   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
14078   ins_encode %{
14079     int vector_len = 0;
14080     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14081   %}
14082   ins_pipe( pipe_slow );
14083 %}
14084 
14085 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
14086   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
14087   match(Set dst (DivVD src (LoadVector mem)));
14088   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
14089   ins_encode %{
14090     int vector_len = 0;
14091     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14092   %}
14093   ins_pipe( pipe_slow );
14094 %}
14095 
14096 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
14097   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14098   match(Set dst (DivVD src1 src2));
14099   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
14100   ins_encode %{
14101     int vector_len = 1;
14102     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14103   %}
14104   ins_pipe( pipe_slow );
14105 %}
14106 
14107 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
14108   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
14109   match(Set dst (DivVD src (LoadVector mem)));
14110   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
14111   ins_encode %{
14112     int vector_len = 1;
14113     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14114   %}
14115   ins_pipe( pipe_slow );
14116 %}
14117 
14118 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
14119   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14120   match(Set dst (DivVD src1 src2));
14121   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
14122   ins_encode %{
14123     int vector_len = 2;
14124     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14125   %}
14126   ins_pipe( pipe_slow );
14127 %}
14128 
14129 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
14130   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
14131   match(Set dst (DivVD src (LoadVector mem)));
14132   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
14133   ins_encode %{
14134     int vector_len = 2;
14135     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
14136   %}
14137   ins_pipe( pipe_slow );
14138 %}
14139 
14140 // ------------------------------ Min ---------------------------------------
14141 // Byte vector Min
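// pminsb/pminsw/pminsd select the signed minimum in each lane.  pminsw
// dates back to SSE2, while pminsb and pminsd are SSE4.1, which is why the
// UseSSE predicates below differ by element type.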
14142 instruct min8B_reg(vecD dst, vecD src1, vecD src2) %{
14143   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14144   match(Set dst (MinV src1 src2));
14145   effect(TEMP dst);
14146   format %{ "movdqu  $dst,$src1\n\t"
14147             "pminsb  $dst,$src2\t!  " %}
14148   ins_encode %{
14149     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14150     __ pminsb($dst$$XMMRegister, $src2$$XMMRegister);
14151   %}
14152   ins_pipe( pipe_slow );
14153 %}
14154 
14155 instruct min8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
14156   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14157   match(Set dst (MinV src1 src2));
14158   format %{ "vpminsb  $dst,$src1,$src2\t!  " %}
14159   ins_encode %{
14160     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14161   %}
14162   ins_pipe( pipe_slow );
14163 %}
14164 
14165 instruct min16B_reg(vecX dst, vecX src1, vecX src2) %{
14166   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14167   match(Set dst (MinV src1 src2));
14168   effect(TEMP dst);
14169   format %{ "movdqu  $dst,$src1\n\t"
14170             "pminsb  $dst,$src2\t!  " %}
14171   ins_encode %{
14172     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14173     __ pminsb($dst$$XMMRegister, $src2$$XMMRegister);
14174   %}
14175   ins_pipe( pipe_slow );
14176 %}
14177 
14178 instruct min16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
14179   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14180   match(Set dst (MinV src1 src2));
14181   format %{ "vpminsb    $dst,$src1,$src2\t! " %}
14182   ins_encode %{
14183     int vector_len = 0;
14184     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14185   %}
14186   ins_pipe( pipe_slow );
14187 %}
14188 
14189 instruct min32B_reg(vecY dst, vecY src1, vecY src2) %{
14190   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14191   match(Set dst (MinV src1 src2));
14192   format %{ "vpminsb    $dst,$src1,$src2\t! " %}
14193   ins_encode %{
14194     int vector_len = 1;
14195     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14196   %}
14197   ins_pipe( pipe_slow );
14198 %}
14199 
14200 instruct min64B_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14201   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14202   match(Set dst (MinV src1 src2));
14203   format %{ "vpminsb  $dst,$src1,$src2\t! " %}
14204   ins_encode %{
14205     int vector_len = 2;
14206     __ vpminsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14207   %}
14208   ins_pipe( pipe_slow );
14209 %}
14210 
// Short vector Min
14212 instruct min4S_reg(vecD dst, vecD src1, vecD src2) %{
14213   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14214   match(Set dst (MinV src1 src2));
14215   effect(TEMP dst);
14216   format %{ "movsd   $dst,$src1\n\t"
14217             "pminsw  $dst,$src2\t! " %}
14218   ins_encode %{
14219     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14220     __ pminsw($dst$$XMMRegister, $src2$$XMMRegister);
14221   %}
14222   ins_pipe( pipe_slow );
14223 %}
14224 
14225 instruct min4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
14226   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14227   match(Set dst (MinV src1 src2));
14228   effect(TEMP dst);
14229   format %{ "vpminsw  $dst,$src1,$src2\t! " %}
14230   ins_encode %{
14231     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14232   %}
14233   ins_pipe( pipe_slow );
14234 %}
14235 
14236 instruct min8S_reg(vecX dst, vecX src1, vecX src2) %{
14237   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14238   match(Set dst (MinV src1 src2));
14239   effect(TEMP dst);
14240   format %{ "movdqu   $dst,$src1\n\t"
14241             "pminsw  $dst,$src2\t! " %}
14242   ins_encode %{
14243     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14244     __ pminsw($dst$$XMMRegister, $src2$$XMMRegister);
14245   %}
14246   ins_pipe( pipe_slow );
14247 %}
14248 
14249 instruct min8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
14250   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14251   match(Set dst (MinV src1 src2));
14252   format %{ "vpminsw    $dst,$src1,$src2\t! " %}
14253   ins_encode %{
14254     int vector_len = 0;
14255     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14256   %}
14257   ins_pipe( pipe_slow );
14258 %}
14259 
14260 instruct min16S_reg(vecY dst, vecY src1, vecY src2) %{
14261   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14262   match(Set dst (MinV src1 src2));
14263   format %{ "vpminsw    $dst,$src1,$src2\t! " %}
14264   ins_encode %{
14265     int vector_len = 1;
14266     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14267   %}
14268   ins_pipe( pipe_slow );
14269 %}
14270 
14271 instruct min32S_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14272   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14273   match(Set dst (MinV src1 src2));
14274   format %{ "vpminsw  $dst,$src1,$src2\t! " %}
14275   ins_encode %{
14276     int vector_len = 2;
14277     __ vpminsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14278   %}
14279   ins_pipe( pipe_slow );
14280 %}
14281 
14282 // Int vector Min
14283 instruct min2I_reg(vecD dst, vecD src1, vecD src2) %{
14284   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14285   match(Set dst (MinV src1 src2));
14286   effect(TEMP dst);
14287   format %{ "movsd   $dst,$src1\n\t"
14288             "pminsd  $dst,$src2\t! " %}
14289   ins_encode %{
14290     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14291     __ pminsd($dst$$XMMRegister, $src2$$XMMRegister);
14292   %}
14293   ins_pipe( pipe_slow );
14294 %}
14295 
14296 instruct min2I_reg_avx(vecD dst, vecD src1, vecD src2) %{
14297   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14298   match(Set dst (MinV src1 src2));
14299   format %{ "vpminsd    $dst,$src1,$src2\t! " %}
14300   ins_encode %{
14301     int vector_len = 0;
14302     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14303   %}
14304   ins_pipe( pipe_slow );
14305 %}
14306 
14307 instruct min4I_reg(vecX dst, vecX src1, vecX src2) %{
14308   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14309   match(Set dst (MinV src1 src2));
14310   effect(TEMP dst);
14311   format %{ "movdqu   $dst,$src1\n\t"
14312             "pminsd   $dst,$src2\t! " %}
14313   ins_encode %{
14314     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14315     __ pminsd($dst$$XMMRegister, $src2$$XMMRegister);
14316   %}
14317   ins_pipe( pipe_slow );
14318 %}
14319 
14320 instruct min4I_reg_avx(vecX dst, vecX src1, vecX src2) %{
14321   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14322   match(Set dst (MinV src1 src2));
14323   format %{ "vpminsd    $dst,$src1,$src2\t! " %}
14324   ins_encode %{
14325     int vector_len = 0;
14326     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14327   %}
14328   ins_pipe( pipe_slow );
14329 %}
14330 
14331 instruct min4I_reg_evex(vecX dst, vecX src1, vecX src2) %{
14332   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14333   match(Set dst (MinV src1 src2));
14334   format %{ "vpminsd  $dst,$src1,$src2\t! " %}
14335   ins_encode %{
14336     int vector_len = 0;
14337     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14338   %}
14339   ins_pipe( pipe_slow );
14340 %}
14341 
14342 instruct min8I_reg_avx(vecY dst, vecY src1, vecY src2) %{
14343   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14344   match(Set dst (MinV src1 src2));
14345   format %{ "vpminsd    $dst,$src1,$src2\t! " %}
14346   ins_encode %{
14347     int vector_len = 1;
14348     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14349   %}
14350   ins_pipe( pipe_slow );
14351 %}
14352 
14353 instruct min8I_reg_evex(vecY dst, vecY src1, vecY src2) %{
14354   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14355   match(Set dst (MinV src1 src2));
14356   format %{ "vpminsd  $dst,$src1,$src2\t! " %}
14357   ins_encode %{
14358     int vector_len = 1;
14359     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14360   %}
14361   ins_pipe( pipe_slow );
14362 %}
14363 
14364 instruct min16I_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14365   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14366   match(Set dst (MinV src1 src2));
14367   format %{ "vpminsd  $dst,$src1,$src2\t! " %}
14368   ins_encode %{
14369     int vector_len = 2;
14370     __ vpminsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14371   %}
14372   ins_pipe( pipe_slow );
14373 %}
14374 
14375 // Long vector Min
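// There is no packed signed 64-bit minimum before AVX-512VL (vpminsq), so
// the older rules emulate it: pcmpgtq builds a src1 > src2 lane mask and
// blendvpd then picks src2 where the mask is set.  The SSE4.1 blendvpd
// reads its mask implicitly from xmm0, which is why the temp is pinned to
// rxmm0.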
14376 instruct minL_reg(vecD dst, vecD src1, vecD src2, rxmm0 tmp) %{
14377   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14378   match(Set dst (MinV src1 src2));
14379   effect(TEMP dst, TEMP tmp);
14380   format %{ "movsd     $tmp,$src1\n\t"
14381             "movsd     $dst,$src1\n\t"
14382             "pcmpgtq   $tmp,$src2\n\t"
14383             "blendvpd  $dst,$src2\t! " %}
14384   ins_encode %{
14385     __ movsd($tmp$$XMMRegister, $src1$$XMMRegister);
14386     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14387     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14388     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14389   %}
14390   ins_pipe( pipe_slow );
14391 %}
14392 
14393 instruct min1L_reg_avx(vecD dst, vecD src1, vecD src2) %{
14394   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14395   match(Set dst (MinV src1 src2));
14396   effect(TEMP dst);
14397   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14398             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14399   ins_encode %{
14400     int vector_len = 0;
14401     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14402     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14403   %}
14404   ins_pipe( pipe_slow );
14405 %}
14406 
14407 instruct min2L_reg(vecX dst, vecX src1, vecX src2, rxmm0 tmp) %{
14408   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14409   match(Set dst (MinV src1 src2));
14410   effect(TEMP dst, TEMP tmp);
14411   format %{ "movdqu    $tmp,$src1\n\t"
14412             "movdqu    $dst,$src1\n\t"
14413             "pcmpgtq   $tmp,$src2\n\t"
14414             "blendvpd  $dst,$src2\t! " %}
14415   ins_encode %{
14416     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
14417     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14418     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14419     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14420   %}
14421   ins_pipe( pipe_slow );
14422 %}
14423 
14424 instruct min2L_reg_avx(vecX dst, vecX src1, vecX src2) %{
14425   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14426   match(Set dst (MinV src1 src2));
14427   effect(TEMP dst);
14428   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14429             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14430   ins_encode %{
14431     int vector_len = 0;
14432     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14433     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14434   %}
14435   ins_pipe( pipe_slow );
14436 %}
14437 
14438 instruct min4L_reg_avx(vecY dst, vecY src1, vecY src2) %{
14439   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14440   match(Set dst (MinV src1 src2));
14441   effect(TEMP dst);
14442   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14443             "vblendvpd  $dst,$src1,$src2,$dst\t! " %}
14444   ins_encode %{
14445     int vector_len = 1;
14446     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14447     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
14448   %}
14449   ins_pipe( pipe_slow );
14450 %}
14451 
14452 instruct min2L_reg_evex(vecX dst, vecX src1, vecX src2) %{
14453   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14454   match(Set dst (MinV src1 src2));
14455   format %{ "vpminsq  $dst,$src1,src2\t! " %}
14456   ins_encode %{
14457     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14458   %}
14459   ins_pipe( pipe_slow );
14460 %}
14461 
14462 instruct min4L_reg_evex(vecY dst, vecY src1, vecY src2) %{
14463   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14464   match(Set dst (MinV src1 src2));
14465   format %{ "vpminsq  $dst,$src1,src2\t! " %}
14466   ins_encode %{
14467     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 1);
14468   %}
14469   ins_pipe( pipe_slow );
14470 %}
14471 
14472 instruct min8L_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14473   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14474   match(Set dst (MinV src1 src2));
14475   format %{ "vpminsq  $dst,$src1,src2\t! " %}
14476   ins_encode %{
14477     __ vpminsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 2);
14478   %}
14479   ins_pipe( pipe_slow );
14480 %}
14481 
14482 // Float vector Min
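// Float min must match Java semantics: NaN propagates, and min(-0.0, +0.0)
// is -0.0.  Raw (v)minps guarantees neither: on a tie or a NaN it simply
// returns its second operand.  The rules below therefore first order the
// operands by the sign of $a so that a negative value lands in the slot
// minps favors, then patch NaN lanes back in with an unordered compare and
// a final blend.  Scalar sketch of the idea (illustration only):
//   x = signbit(a) ? b : a;  y = signbit(a) ? a : b;
//   m = minps(x, y);                 // returns y on tie or NaN
//   dst = unordered(x, x) ? x : m;   // re-inject a NaN hiding in x
// The AVX-512 form further down does the same ordering with vpmovd2m, which
// copies the per-lane sign bits into a mask register (an AVX512DQ
// instruction, hence the supports_avx512dq() predicate), then merges with
// vblendmps and a masked move.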
14483 instruct min2F_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
14484   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14485   match(Set dst (MinV a b));
14486   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14487   format %{
14488      "blendvps         $atmp,$a,$b,$a             \n\t"
14489      "blendvps         $btmp,$b,$a,$a             \n\t"
14490      "vminps           $tmp,$atmp,$btmp           \n\t"
14491      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
14492      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
14493   %}
14494   ins_encode %{
14495     int vector_len = 0;
14496     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14497     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14498     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14499     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14500     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14501   %}
14502   ins_pipe( pipe_slow );
14503 %}
14504 
14505 instruct min4F_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
14506   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14507   match(Set dst (MinV a b));
14508   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14509   format %{
14510      "blendvps         $atmp,$a,$b,$a             \n\t"
14511      "blendvps         $btmp,$b,$a,$a             \n\t"
14512      "vminps           $tmp,$atmp,$btmp           \n\t"
14513      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
14514      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
14515   %}
14516   ins_encode %{
14517     int vector_len = 0;
14518     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14519     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14520     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14521     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14522     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14523   %}
14524   ins_pipe( pipe_slow );
14525 %}
14526 
14527 instruct min8F_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
14528   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14529   match(Set dst (MinV a b));
14530   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14531   format %{
14532      "blendvps         $atmp,$a,$b,$a             \n\t"
14533      "blendvps         $btmp,$b,$a,$a             \n\t"
14534      "vminps           $tmp,$atmp,$btmp           \n\t"
14535      "cmpps.unordered  $btmp, $atmp, $atmp        \n\t"
14536      "blendvps         $dst,$tmp,$atmp,$btmp      \n\t"
14537   %}
14538   ins_encode %{
14539     int vector_len = 1;
14540     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14541     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14542     __ vminps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14543     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14544     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14545   %}
14546   ins_pipe( pipe_slow );
14547 %}
14548 
14549 instruct min16F_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
14550   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() &&  n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
14551   match(Set dst (MinV a b));
14552   effect(USE a, USE b, TEMP atmp, TEMP btmp);
14553   format %{ 
14554      "vpmovd2m         k1,$a                    \n\t"
14555      "vblendmps        $atmp,k1,$a,$b           \n\t"
14556      "vblendmps        $btmp,k1,$b,$a           \n\t"
14557      "vminps           $dst,$atmp,$btmp         \n\t"
14558      "vcmpps.unordered      k1,$atmp,$atmp           \n\t"
14559      "vmovaps          $dst,k1,$atmp            \n\t"
14560   %}
14561   ins_encode %{
14562     int vector_len = 2;
14563     KRegister ktmp = k1;
14564     KRegister mask = k0;
14565     __ evpmovd2m(ktmp, $a$$XMMRegister, vector_len); 
14566     __ evblendmps($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
14567     __ evblendmps($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
14568     __ vminps($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14569     __ evcmpps(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14570     __ evmovdqul($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
14571   %}
14572   ins_pipe( pipe_slow );
14573 %}
14574 
14575 // Double vector Min
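// Same ordering-plus-NaN-fixup scheme as the float rules above, applied to
// 64-bit lanes with blendvpd/vminpd/cmppd; the AVX-512 form uses vpmovq2m
// (also AVX512DQ) to pull the qword sign bits into mask register k1.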
14576 instruct min1D_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
14577   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14578   match(Set dst (MinV a b));
14579   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14580   format %{ 
14581      "blendvpd         $atmp,$a,$b,$a           \n\t"
14582      "blendvpd         $btmp,$b,$a,$a           \n\t"
14583      "vminpd           $tmp,$atmp,$btmp         \n\t"
14584      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
14585      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
14586   %}
14587   ins_encode %{
14588     int vector_len = 0;
14589     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14590     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14591     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14592     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14593     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14594   %}
14595   ins_pipe( pipe_slow );
14596 %}
14597 
14598 instruct min2D_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
14599   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14600   match(Set dst (MinV a b));
14601   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14602   format %{ 
14603      "blendvpd         $atmp,$a,$b,$a           \n\t"
14604      "blendvpd         $btmp,$b,$a,$a           \n\t"
14605      "vminpd           $tmp,$atmp,$btmp         \n\t"
14606      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
14607      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
14608   %}
14609   ins_encode %{
14610     int vector_len = 0;
14611     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14612     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14613     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14614     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14615     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14616   %}
14617   ins_pipe( pipe_slow );
14618 %}
14619 
14620 instruct min4D_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
14621   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14622   match(Set dst (MinV a b));
14623   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
14624   format %{ 
14625      "blendvpd         $atmp,$a,$b,$a           \n\t"
14626      "blendvpd         $btmp,$b,$a,$a           \n\t"
14627      "vminpd           $tmp,$atmp,$btmp         \n\t"
14628      "cmppd.unordered  $btmp, $atmp, $atmp      \n\t"
14629      "blendvpd         $dst,$tmp,$atmp,$btmp    \n\t"
14630   %}
14631   ins_encode %{
14632     int vector_len = 1;
14633     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
14634     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
14635     __ vminpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
14636     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14637     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14638   %}
14639   ins_pipe( pipe_slow );
14640 %}
14641 
14642 instruct min8D_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
14643   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
14644   match(Set dst (MinV a b));
14645   effect(USE a, USE b, TEMP atmp, TEMP btmp);
14646   format %{ 
14647      "vpmovq2m         k1,$a                    \n\t"
14648      "vblendmpd        $atmp,k1,$a,$b           \n\t"
14649      "vblendmpd        $btmp,k1,$b,$a           \n\t"
14650      "vminpd           $dst,$atmp,$btmp         \n\t"
14651      "vcmppd.unordered      k1,$atmp,$atmp           \n\t"
14652      "vmovapd          $dst,k1,$atmp            \n\t"
14653   %}
14654   ins_encode %{
14655     int vector_len = 2;
14656     KRegister ktmp = k1;
14657     KRegister mask = k0;
14658     __ evpmovq2m(ktmp, $a$$XMMRegister, vector_len); 
14659     __ evblendmpd($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
14660     __ evblendmpd($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
14661     __ vminpd($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
14662     __ evcmppd(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
14663     __ evmovdquq($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
14664   %}
14665   ins_pipe( pipe_slow );
14666 %}
14667 
14668 // ------------------------------ Max ---------------------------------------
14669 // Byte vector Max
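// pmaxsb/pminsb only exist from SSE4.1 on (pmaxsw/pminsw for words date back
// to SSE2), hence UseSSE > 3 here versus UseSSE > 1 for the short rules.
// The SSE forms are destructive (dst op= src2), so src1 is copied into dst
// first; the AVX forms are non-destructive three-operand encodings.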
14670 instruct max8B_reg(vecD dst, vecD src1, vecD src2) %{
14671   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14672   match(Set dst (MaxV src1 src2));
14673   effect(TEMP dst);
14674   format %{ "movsd   $dst,$src1\n\t"
14675             "pmaxsb  $dst,$src2\t! " %}
14676   ins_encode %{
14677     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14678     __ pmaxsb($dst$$XMMRegister, $src2$$XMMRegister);
14679   %}
14680   ins_pipe( pipe_slow );
14681 %}
14682 
14683 instruct max8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
14684   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14685   match(Set dst (MaxV src1 src2));
14686   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
14687   ins_encode %{
14688     int vector_len = 0;
14689     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14690   %}
14691   ins_pipe( pipe_slow );
14692 %}
14693 
14694 instruct max16B_reg(vecX dst, vecX src1, vecX src2) %{
14695   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14696   match(Set dst (MaxV src1 src2));
14697   effect(TEMP dst);
14698   format %{ "movdqu  $dst,$src1\n\t"
14699             "pmaxsb  $dst,$src2\t! " %}
14700   ins_encode %{
14701     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14702     __ pmaxsb($dst$$XMMRegister, $src2$$XMMRegister);
14703   %}
14704   ins_pipe( pipe_slow );
14705 %}
14706 
14707 instruct max16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
14708   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14709   match(Set dst (MaxV src1 src2));
14710   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
14711   ins_encode %{
14712     int vector_len = 0;
14713     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14714   %}
14715   ins_pipe( pipe_slow );
14716 %}
14717 
14718 instruct max32B_reg(vecY dst, vecY src1, vecY src2) %{
14719   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14720   match(Set dst (MaxV src1 src2));
14721   format %{ "vpmaxsb    $dst,$src1,$src2\t! " %}
14722   ins_encode %{
14723     int vector_len = 1;
14724     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14725   %}
14726   ins_pipe( pipe_slow );
14727 %}
14728 
14729 instruct max64B_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14730   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
14731   match(Set dst (MaxV src1 src2));
14732   format %{ "vpmaxsb  $dst,$src1,$src2\t! " %}
14733   ins_encode %{
14734     int vector_len = 2;
14735     __ vpmaxsb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14736   %}
14737   ins_pipe( pipe_slow );
14738 %}
14739 
// Short vector Max
14741 instruct max4S_reg(vecD dst, vecD src1, vecD src2) %{
14742   predicate(UseSSE > 1 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14743   match(Set dst (MaxV src1 src2));
14744   effect(TEMP dst);
14745   format %{ "movsd   $dst,$src1\n\t"
14746             "pmaxsw  $dst,$src2\t! " %}
14747   ins_encode %{
14748     __ movsd($dst$$XMMRegister, $src1$$XMMRegister);
14749     __ pmaxsw($dst$$XMMRegister, $src2$$XMMRegister);
14750   %}
14751   ins_pipe( pipe_slow );
14752 %}
14753 
14754 instruct max4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
14755   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14756   match(Set dst (MaxV src1 src2));
14757   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
14758   ins_encode %{
14759     int vector_len = 0;
14760     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14761   %}
14762   ins_pipe( pipe_slow );
14763 %}
14764 
14765 instruct max8S_reg(vecX dst, vecX src1, vecX src2) %{
14766   predicate(UseSSE > 1 && UseAVX == 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14767   match(Set dst (MaxV src1 src2));
14768   effect(TEMP dst);
14769   format %{ "movdqu  $dst,$src1\n\t"
14770             "pmaxsw  $dst,$src2\t! " %}
14771   ins_encode %{
14772     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14773     __ pmaxsw($dst$$XMMRegister, $src2$$XMMRegister);
14774   %}
14775   ins_pipe( pipe_slow );
14776 %}
14777 
14778 instruct max8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
14779   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14780   match(Set dst (MaxV src1 src2));
14781   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
14782   ins_encode %{
14783     int vector_len = 0;
14784     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14785   %}
14786   ins_pipe( pipe_slow );
14787 %}
14788 
14789 instruct max16S_reg(vecY dst, vecY src1, vecY src2) %{
14790   predicate(UseAVX > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14791   match(Set dst (MaxV src1 src2));
14792   format %{ "vpmaxsw    $dst,$src1,$src2\t! " %}
14793   ins_encode %{
14794     int vector_len = 1;
14795     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14796   %}
14797   ins_pipe( pipe_slow );
14798 %}
14799 
14800 instruct max32S_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14801   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
14802   match(Set dst (MaxV src1 src2));
14803   format %{ "vpmaxsw  $dst,$src1,$src2\t! " %}
14804   ins_encode %{
14805     int vector_len = 2;
14806     __ vpmaxsw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14807   %}
14808   ins_pipe( pipe_slow );
14809 %}
14810 
14811 // Int vector Max
14812 instruct max2I_reg(vecD dst, vecD src1, vecD src2) %{
14813   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14814   match(Set dst (MaxV src1 src2));
14815   effect(TEMP dst);
14816   format %{ "movdqu  $dst,$src1\n\t"
14817             "pmaxsd  $dst,$src2\t! " %}
14818   ins_encode %{
14819     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14820     __ pmaxsd($dst$$XMMRegister, $src2$$XMMRegister);
14821   %}
14822   ins_pipe( pipe_slow );
14823 %}
14824 
14825 instruct max2I_reg_avx(vecD dst, vecD src1, vecD src2) %{
14826   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14827   match(Set dst (MaxV src1 src2));
14828   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
14829   ins_encode %{
14830     int vector_len = 0;
14831     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14832   %}
14833   ins_pipe( pipe_slow );
14834 %}
14835 
14836 instruct max4I_reg(vecX dst, vecX src1, vecX src2) %{
14837   predicate(UseSSE > 3 && UseAVX == 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14838   match(Set dst (MaxV src1 src2));
14839   effect(TEMP dst);
14840   format %{ "movdqu  $dst,$src1\n\t"
14841             "pmaxsd  $dst,$src2\t! " %}
14842   ins_encode %{
14843     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14844     __ pmaxsd($dst$$XMMRegister, $src2$$XMMRegister);
14845   %}
14846   ins_pipe( pipe_slow );
14847 %}
14848 
14849 instruct max4I_reg_avx(vecX dst, vecX src1, vecX src2) %{
14850   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14851   match(Set dst (MaxV src1 src2));
14852   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
14853   ins_encode %{
14854     int vector_len = 0;
14855     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14856   %}
14857   ins_pipe( pipe_slow );
14858 %}
14859 
14860 instruct max4I_reg_evex(vecX dst, vecX src1, vecX src2) %{
14861   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14862   match(Set dst (MaxV src1 src2));
14863   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
14864   ins_encode %{
14865     int vector_len = 0;
14866     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14867   %}
14868   ins_pipe( pipe_slow );
14869 %}
14870 
14871 instruct max8I_reg_avx(vecY dst, vecY src1, vecY src2) %{
14872   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14873   match(Set dst (MaxV src1 src2));
14874   format %{ "vpmaxsd    $dst,$src1,$src2\t! " %}
14875   ins_encode %{
14876     int vector_len = 1;
14877     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14878   %}
14879   ins_pipe( pipe_slow );
14880 %}
14881 
14882 instruct max8I_reg_evex(vecY dst, vecY src1, vecY src2) %{
14883   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14884   match(Set dst (MaxV src1 src2));
14885   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
14886   ins_encode %{
14887     int vector_len = 1;
14888     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14889   %}
14890   ins_pipe( pipe_slow );
14891 %}
14892 
14893 instruct max16I_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
14894   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
14895   match(Set dst (MaxV src1 src2));
14896   format %{ "vpmaxsd  $dst,$src1,$src2\t! " %}
14897   ins_encode %{
14898     int vector_len = 2;
14899     __ vpmaxsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14900   %}
14901   ins_pipe( pipe_slow );
14902 %}
14903 
// Long vector Max
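// Mirror image of the long-min rules above: build a pcmpgtq mask, then let
// blendvpd keep the larger lane.  maxL_reg compares src1 > src2 and blends
// src1 into a copy of src2, while max2L_reg compares src2 > src1 and blends
// src2 into a copy of src1; both compute dst = (src1 > src2) ? src1 : src2.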
14905 instruct maxL_reg(vecD dst, vecD src1, vecD src2, rxmm0 tmp) %{
14906   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14907   match(Set dst (MaxV src1 src2));
14908   effect(TEMP dst, TEMP tmp);
14909   format %{ "movsd     $tmp,$src1\n\t"
14910             "movsd     $dst,$src1\n\t"
14911             "pcmpgtq   $tmp,$src2\n\t"
14912             "blendvpd  $dst,$src2\t! " %}
14913   ins_encode %{
14914     __ movsd($tmp$$XMMRegister, $src1$$XMMRegister);
14915     __ movsd($dst$$XMMRegister, $src2$$XMMRegister);
14916     __ pcmpgtq($tmp$$XMMRegister, $src2$$XMMRegister);
14917     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister);
14918   %}
14919   ins_pipe( pipe_slow );
14920 %}
14921 
14922 instruct max1L_reg_avx(vecD dst, vecD src1, vecD src2) %{
14923   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14924   match(Set dst (MaxV src1 src2));
14925   effect(TEMP dst);
14926   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14927             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
14928   ins_encode %{
14929     int vector_len = 0;
14930     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14931     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
14932   %}
14933   ins_pipe( pipe_slow );
14934 %}
14935 
14936 instruct max2L_reg(vecX dst, vecX src1, vecX src2, rxmm0 tmp) %{
14937   predicate(UseAVX == 0 && UseSSE >= 4 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14938   match(Set dst (MaxV src1 src2));
14939   effect(TEMP dst, TEMP tmp);
14940   format %{ "movdqu    $tmp,$src2\n\t"
14941             "movdqu    $dst,$src1\n\t"
14942             "pcmpgtq   $tmp,$src1\n\t"
14943             "blendvpd  $dst,$src2\t! " %}
14944   ins_encode %{
14945     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
14946     __ movdqu($dst$$XMMRegister, $src1$$XMMRegister);
14947     __ pcmpgtq($tmp$$XMMRegister, $src1$$XMMRegister);
14948     __ blendvpd($dst$$XMMRegister, $src2$$XMMRegister);
14949   %}
14950   ins_pipe( pipe_slow );
14951 %}
14952 
14953 instruct max2L_reg_avx(vecX dst, vecX src1, vecX src2) %{
14954   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14955   match(Set dst (MaxV src1 src2));
14956   effect(TEMP dst);
14957   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14958             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
14959   ins_encode %{
14960     int vector_len = 0;
14961     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14962     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
14963   %}
14964   ins_pipe( pipe_slow );
14965 %}
14966 
14967 instruct max2L_reg_evex(vecX dst, vecX src1, vecX src2) %{
14968   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14969   match(Set dst (MaxV src1 src2));
14970   format %{ "vpmaxsq  $dst,$src1,src2\t! " %}
14971   ins_encode %{
14972     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0);
14973   %}
14974   ins_pipe( pipe_slow );
14975 %}
14976 
14977 instruct max4L_reg_avx(vecY dst, vecY src1, vecY src2) %{
14978   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14979   match(Set dst (MaxV src1 src2));
14980   effect(TEMP dst);
14981   format %{ "vpcmpgtq   $dst,$src1,$src2\n\t"
14982             "vblendvpd  $dst,$src2,$src1,$dst\t! " %}
14983   ins_encode %{
14984     int vector_len = 1;
14985     __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
14986     __ vblendvpd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $dst$$XMMRegister, vector_len);
14987   %}
14988   ins_pipe( pipe_slow );
14989 %}
14990 
14991 instruct max4L_reg_evex(vecY dst, vecY src1, vecY src2) %{
14992   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
14993   match(Set dst (MaxV src1 src2));
14994   format %{ "vpmaxsq  $dst,$src1,src2\t! " %}
14995   ins_encode %{
14996     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 1);
14997   %}
14998   ins_pipe( pipe_slow );
14999 %}
15000 
15001 instruct max8L_reg_evex(vecZ dst, vecZ src1, vecZ src2) %{
15002   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
15003   match(Set dst (MaxV src1 src2));
15004   format %{ "vpmaxsq  $dst,$src1,src2\t! " %}
15005   ins_encode %{
15006     __ vpmaxsq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 2);
15007   %}
15008   ins_pipe( pipe_slow );
15009 %}
15010 
// Float vector Max
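// Same structure as float min with the roles reversed: the operands are
// ordered by the sign of $b so that a positive value sits where (v)maxps
// resolves ties (maxps, like minps, returns its second operand on a tie or
// NaN).  That yields max(-0.0, +0.0) = +0.0, and the trailing unordered
// compare plus blend again propagates NaN.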
15012 instruct max2F_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
15013   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15014   match(Set dst (MaxV a b));
15015   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
15016   format %{ 
15017      "blendvps         $btmp,$b,$a,$b           \n\t"
15018      "blendvps         $atmp,$a,$b,$b           \n\t"
15019      "vmaxps           $tmp,$atmp,$btmp         \n\t"
15020      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
15021      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
15022   %}
15023   ins_encode %{
15024     int vector_len = 0;
15025     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15026     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15027     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15028     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15029     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15030  %}
15031  ins_pipe( pipe_slow );
15032 %}
15033 
15034 instruct max4F_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
15035   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15036   match(Set dst (MaxV a b));
15037   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
15038   format %{ 
15039      "blendvps         $btmp,$b,$a,$b           \n\t"
15040      "blendvps         $atmp,$a,$b,$b           \n\t"
15041      "vmaxps           $tmp,$atmp,$btmp         \n\t"
15042      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
15043      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
15044   %}
15045   ins_encode %{
15046     int vector_len = 0;
15047     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15048     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15049     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15050     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15051     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15052  %}
15053  ins_pipe( pipe_slow );
15054 %}
15055 
15056 instruct max8F_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
15057   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15058   match(Set dst (MaxV a b));
15059   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
15060   format %{ 
15061      "blendvps         $btmp,$b,$a,$b           \n\t"
15062      "blendvps         $atmp,$a,$b,$b           \n\t"
15063      "vmaxps           $tmp,$atmp,$btmp         \n\t"
15064      "cmpps.unordered  $btmp, $atmp, $atmp      \n\t"
15065      "blendvps         $dst,$tmp,$atmp,$btmp    \n\t"
15066   %}
15067   ins_encode %{
15068     int vector_len = 1;
15069     __ vblendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15070     __ vblendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15071     __ vmaxps($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15072     __ vcmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15073     __ vblendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15074  %}
15075  ins_pipe( pipe_slow );
15076 %}
15077 
15078 instruct max16F_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
15079   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
15080   match(Set dst (MaxV a b));
15081   effect(USE a, USE b, TEMP atmp, TEMP btmp);
15082   format %{ 
15083      "vpmovd2m         k1,$b              \n\t"
15084      "vblendmps        $atmp,k1,$a,$b     \n\t"
15085      "vblendmps        $btmp,k1,$b,$a     \n\t"
15086      "vmaxps           $dst,$atmp,$btmp   \n\t"
15087      "vcmpps.unordered      k1,$atmp,$atmp     \n\t"
15088      "vmovaps          $dst,k1,$atmp      \n\t"
15089   %}
15090   ins_encode %{
15091     int vector_len = 2;
15092     KRegister ktmp = k1; 
15093     KRegister mask = k0;
15094     __ evpmovd2m(ktmp, $b$$XMMRegister, vector_len); 
15095     __ evblendmps($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
15096     __ evblendmps($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
15097     __ vmaxps($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15098     __ evcmpps(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15099     __ evmovdqul($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
15100   %}
15101   ins_pipe( pipe_slow );
15102 %}
15103 
// Double vector Max
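// Double variant of the float-max blend sequence; the AVX-512 rule below
// swaps the xmm blends for vpmovq2m/vblendmpd under mask register k1.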
15105 instruct max1D_reg_avx(legVecD dst, legVecD a, legVecD b, legVecD tmp, legVecD atmp, legVecD btmp) %{
15106   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15107   match(Set dst (MaxV a b));
15108   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
15109   format %{ 
15110      "blendvpd         $btmp,$b,$a,$b            \n\t"
15111      "blendvpd         $atmp,$a,$b,$b            \n\t"
15112      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
15113      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
15114      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
15115   %}
15116   ins_encode %{
15117     int vector_len = 0;
15118     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15119     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15120     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15121     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15122     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15123   %}
15124   ins_pipe( pipe_slow );
15125 %}
15126 
15127 instruct max2D_reg_avx(legVecX dst, legVecX a, legVecX b, legVecX tmp, legVecX atmp, legVecX btmp) %{
15128   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15129   match(Set dst (MaxV a b));
15130   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
15131   format %{ 
15132      "blendvpd         $btmp,$b,$a,$b            \n\t"
15133      "blendvpd         $atmp,$a,$b,$b            \n\t"
15134      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
15135      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
15136      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
15137   %}
15138   ins_encode %{
15139     int vector_len = 0;
15140     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15141     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15142     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15143     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15144     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15145   %}
15146   ins_pipe( pipe_slow );
15147 %}
15148 
15149 instruct max4D_reg_avx(legVecY dst, legVecY a, legVecY b, legVecY tmp, legVecY atmp, legVecY btmp) %{
15150   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15151   match(Set dst (MaxV a b));
15152   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
15153   format %{ 
15154      "blendvpd         $btmp,$b,$a,$b            \n\t"
15155      "blendvpd         $atmp,$a,$b,$b            \n\t"
15156      "vmaxpd           $tmp,$atmp,$btmp          \n\t"
15157      "cmppd.unordered  $btmp, $atmp, $atmp       \n\t"
15158      "blendvpd         $dst,$tmp,$atmp,$btmp     \n\t"
15159   %}
15160   ins_encode %{
15161     int vector_len = 1;
15162     __ vblendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
15163     __ vblendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
15164     __ vmaxpd($tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vector_len);
15165     __ vcmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15166     __ vblendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15167   %}
15168   ins_pipe( pipe_slow );
15169 %}
15170 
15172 instruct max8D_reg_evex(vecZ dst, vecZ a, vecZ b, vecZ atmp, vecZ btmp) %{
15173   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
15174   match(Set dst (MaxV a b));
15175   effect(USE a, USE b, TEMP atmp, TEMP btmp);
15176   format %{ 
15177      "vpmovq2m         k1,$b              \n\t"
15178      "vblendmpd        $atmp,k1,$a,$b     \n\t"
15179      "vblendmpd        $btmp,k1,$b,$a     \n\t"
15180      "vmaxpd           $dst,$atmp,$btmp   \n\t"
15181      "vcmppd.unordered      k1,$atmp,$atmp     \n\t"
15182      "vmovapd          $dst,k1,$atmp      \n\t"
15183   %}
15184   ins_encode %{
15185     int vector_len = 2;
15186     KRegister ktmp = k1; 
15187     KRegister mask = k0;
15188     __ evpmovq2m(ktmp, $b$$XMMRegister, vector_len); 
15189     __ evblendmpd($atmp$$XMMRegister, ktmp, $a$$XMMRegister, $b$$XMMRegister, true, vector_len); 
15190     __ evblendmpd($btmp$$XMMRegister, ktmp, $b$$XMMRegister, $a$$XMMRegister, true, vector_len); 
15191     __ vmaxpd($dst$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
15192     __ evcmppd(ktmp, mask, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::UNORD_Q, vector_len);
15193     __ evmovdquq($dst$$XMMRegister, ktmp, $atmp$$XMMRegister, true, vector_len);
15194   %}
15195   ins_pipe( pipe_slow );
15196 %}
15197 
15198 // ------------------------------ Shift ---------------------------------------
15199 
15200 // Left and right shift count vectors are the same on x86
15201 // (only lowest bits of xmm reg are used for count).
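// The count is materialized once with movd and shared by both directions;
// the variable-count (v)psll/psrl/psra forms read the count from the low
// 64 bits of the xmm operand, and a count at or above the lane width zeroes
// the result (arithmetic right shifts sign-fill instead).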
15202 instruct vshiftcnt(vecS dst, rRegI cnt) %{
15203   match(Set dst (LShiftCntV cnt));
15204   match(Set dst (RShiftCntV cnt));
15205   format %{ "movd    $dst,$cnt\t! load shift count" %}
15206   ins_encode %{
15207     __ movdl($dst$$XMMRegister, $cnt$$Register);
15208   %}
15209   ins_pipe( pipe_slow );
15210 %}
15211 
15212 // --------------------------------- Sqrt --------------------------------------
15213 
15214 // Floating point vector sqrt
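// These map directly onto vsqrtps/vsqrtpd.  Each width has a register form
// and a memory form so the matcher can fold a LoadVector straight into the
// sqrt; vector_len again picks the 128/256/512-bit encoding, and the
// 512-bit forms require UseAVX > 2.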
15215 instruct vsqrt2D_reg(vecX dst, vecX src) %{
15216   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15217   match(Set dst (SqrtVD src));
15218   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
15219   ins_encode %{
15220     int vector_len = 0;
15221     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15222   %}
15223   ins_pipe( pipe_slow );
15224 %}
15225 
15226 instruct vsqrt2D_mem(vecX dst, memory mem) %{
15227   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15228   match(Set dst (SqrtVD (LoadVector mem)));
15229   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
15230   ins_encode %{
15231     int vector_len = 0;
15232     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15233   %}
15234   ins_pipe( pipe_slow );
15235 %}
15236 
15237 instruct vsqrt4D_reg(vecY dst, vecY src) %{
15238   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15239   match(Set dst (SqrtVD src));
15240   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
15241   ins_encode %{
15242     int vector_len = 1;
15243     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15244   %}
15245   ins_pipe( pipe_slow );
15246 %}
15247 
15248 instruct vsqrt4D_mem(vecY dst, memory mem) %{
15249   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15250   match(Set dst (SqrtVD (LoadVector mem)));
15251   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
15252   ins_encode %{
15253     int vector_len = 1;
15254     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15255   %}
15256   ins_pipe( pipe_slow );
15257 %}
15258 
15259 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
15260   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15261   match(Set dst (SqrtVD src));
15262   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
15263   ins_encode %{
15264     int vector_len = 2;
15265     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15266   %}
15267   ins_pipe( pipe_slow );
15268 %}
15269 
15270 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
15271   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15272   match(Set dst (SqrtVD (LoadVector mem)));
15273   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
15274   ins_encode %{
15275     int vector_len = 2;
15276     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
15277   %}
15278   ins_pipe( pipe_slow );
15279 %}
15280 
15281 instruct vsqrt2F_reg(vecD dst, vecD src) %{
15282   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15283   match(Set dst (SqrtVF src));
15284   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
15285   ins_encode %{
15286     int vector_len = 0;
15287     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15288   %}
15289   ins_pipe( pipe_slow );
15290 %}
15291 
15292 instruct vsqrt2F_mem(vecD dst, memory mem) %{
15293   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15294   match(Set dst (SqrtVF (LoadVector mem)));
15295   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
15296   ins_encode %{
15297     int vector_len = 0;
15298     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15299   %}
15300   ins_pipe( pipe_slow );
15301 %}
15302 
15303 instruct vsqrt4F_reg(vecX dst, vecX src) %{
15304   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15305   match(Set dst (SqrtVF src));
15306   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
15307   ins_encode %{
15308     int vector_len = 0;
15309     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15310   %}
15311   ins_pipe( pipe_slow );
15312 %}
15313 
15314 instruct vsqrt4F_mem(vecX dst, memory mem) %{
15315   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15316   match(Set dst (SqrtVF (LoadVector mem)));
15317   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
15318   ins_encode %{
15319     int vector_len = 0;
15320     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15321   %}
15322   ins_pipe( pipe_slow );
15323 %}
15324 
15325 instruct vsqrt8F_reg(vecY dst, vecY src) %{
15326   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15327   match(Set dst (SqrtVF src));
15328   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
15329   ins_encode %{
15330     int vector_len = 1;
15331     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15332   %}
15333   ins_pipe( pipe_slow );
15334 %}
15335 
15336 instruct vsqrt8F_mem(vecY dst, memory mem) %{
15337   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15338   match(Set dst (SqrtVF (LoadVector mem)));
15339   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
15340   ins_encode %{
15341     int vector_len = 1;
15342     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15343   %}
15344   ins_pipe( pipe_slow );
15345 %}
15346 
15347 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
15348   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15349   match(Set dst (SqrtVF src));
15350   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
15351   ins_encode %{
15352     int vector_len = 2;
15353     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
15354   %}
15355   ins_pipe( pipe_slow );
15356 %}
15357 
15358 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
15359   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15360   match(Set dst (SqrtVF (LoadVector mem)));
15361   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
15362   ins_encode %{
15363     int vector_len = 2;
15364     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
15365   %}
15366   ins_pipe( pipe_slow );
15367 %}
15368 
15369 // ------------------------------ LeftShift -----------------------------------
15370 
15371 // Byte vector left shift
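// x86 has no packed shift for byte lanes, so bytes are widened to words
// with pmovsxbw, shifted with (v)psllw, masked back down to 8 significant
// bits and re-packed.  The 0x00ff mask matters because packuswb saturates:
// without it, shifted-out high bits would clamp lanes to 0xff instead of
// truncating them.  Sketch of the 8-byte case (illustration only):
//   w   = sign_extend_bytes_to_words(src);
//   w   = w << shift;
//   w  &= 0x00ff;                      // per word, via the external mask
//   dst = packuswb(w, w);              // low 8 bytes hold the result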
15372 instruct vsll4B_reg(vecS dst, vecS src, vecS shift, vecD tmp, vecD tmp2) %{
15373   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
15374   match(Set dst (LShiftVB src shift));
15375   effect(TEMP tmp2, TEMP tmp);
15376   format %{"pmovsxbw  $tmp,$src\n\t"
15377            "psllw     $tmp,$shift\n\t"
15378            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
15379            "pand      $tmp,$tmp2\n\t"
15380            "packuswb  $tmp,$tmp\n\t"
15381            "movss     $dst,$tmp\n\t! left shift packed4B" %}
15382   ins_encode %{
15383     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
15384     __ psllw($tmp$$XMMRegister, $shift$$XMMRegister);
15385     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15386     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
15387     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
15388     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
15389   %}
15390   ins_pipe( pipe_slow );
15391 %}
15392 
15393 instruct vsll8B_reg(vecD dst, vecD src, vecS shift, vecX tmp, vecX tmp2) %{
15394   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
15395   match(Set dst (LShiftVB src shift));
15396   effect(TEMP tmp2, TEMP tmp);
15397   format %{"pmovsxbw  $tmp,$src\n\t"
15398            "psllw     $tmp,$shift\n\t"
15399            "movdqu    $tmp2,[0x00ff00ff0x00ff00ff]\n\t"
15400            "pand      $tmp,$tmp2\n\t"
15401            "packuswb  $tmp,$tmp\n\t"
15402            "movsd     $dst,$tmp\n\t! left shift packed8B" %}
15403   ins_encode %{
15404     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
15405     __ psllw($tmp$$XMMRegister, $shift$$XMMRegister);
15406     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15407     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
15408     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
15409     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
15410   %}
15411   ins_pipe( pipe_slow );
15412 %}
15413 
15414 instruct vsll16B_reg(vecX dst, vecX src, vecS shift, vecX tmp, vecX tmp2, vecX tmp3) %{
15415   predicate(UseSSE > 3  && n->as_Vector()->length() == 16);
15416   match(Set dst (LShiftVB src shift));
15417   effect(TEMP tmp2, TEMP tmp, TEMP tmp3);
15418   format %{"pmovsxbw  $tmp,$src\n\t"
15419            "psllw     $tmp,$shift\n\t"
15420            "pshufd    $tmp2,$src\n\t"
15421            "pmovsxbw  $tmp2,$tmp2\n\t"
15422            "psllw     $tmp2,$shift\n\t"
15423            "movdqu    $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
15424            "pand      $tmp,$tmp3\n\t"
15425            "pand      $tmp2,$tmp3\n\t"
15426            "packuswb  $tmp,$tmp2\n\t"
15427            "modqu     $dst,$tmp\n\t! left shift packed16B" %}
15428   ins_encode %{
15429     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
15430     __ psllw($tmp$$XMMRegister, $shift$$XMMRegister);
15431     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0x0E);
15432     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
15433     __ psllw($tmp2$$XMMRegister, $shift$$XMMRegister);
15434     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15435     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
15436     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
15437     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
15438     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
15439   %}
15440   ins_pipe( pipe_slow );
15441 %}
15442 
15443 instruct vsll16B_avx(vecX dst, vecX src, vecS shift, vecY tmp, rRegL scratch) %{
15444   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
15445   match(Set dst (LShiftVB src shift));
15446   effect(TEMP dst, TEMP tmp, TEMP scratch);
15447   format %{"vpmovsxbw  $tmp,$src\n\t"
15448            "vpsllw     $tmp,$tmp,$shift\\n\t"
15449            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
15450            "vextracti128_high  $dst,$tmp\n\t"
15451            "vpackuswb  $dst,$tmp, $dst\n\t! left shift packed16B" %}
15452   ins_encode %{
15453     int vector_len = 1;
15454     __ vpmovsxbw($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
15455     __ vpsllw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15456     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
15457     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
15458     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
15459   %}
15460   ins_pipe( pipe_slow );
15461 %}
15462 
15463 instruct vsll32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, vecY tmp2, rRegL scratch) %{
15464   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
15465   match(Set dst (LShiftVB src shift));
15466   effect(TEMP dst, TEMP tmp2, TEMP tmp, TEMP scratch);
15467   format %{"vextracti128_high  $tmp,$src\n\t"
15468            "vpmovsxbw   $tmp,$tmp\n\t"
15469            "vpmovsxbw   $tmp2,$src\n\t"
15470            "vpsllw      $tmp,$tmp,$shift\n\t"
15471            "vpsllw      $tmp2,$tmp2,$shift\n\t"
15472            "vpand       $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
15473            "vpand       $tmp2,$tmp2,[0x00ff00ff0x00ff00ff]\n\t"
15474            "vpackuswb   $dst,$tmp2,$tmp\n\t"
15475            "vpermq      $dst,$dst, 0xD8\n\t! left shift for packed32B" %}
15476   ins_encode %{
15477     int vector_len = 1;
15478     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
15479     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
15480     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
15481     __ vpsllw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15482     __ vpsllw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
15483     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
15484     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
15485     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
15486     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
15487   %}
15488   ins_pipe( pipe_slow );
15489 %}
15490 
15491 instruct vsll64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp, vecZ tmp2, vecZ tmp3, rRegL scratch) %{
15492   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
15493   match(Set dst (LShiftVB src shift));
15494   effect(TEMP dst, TEMP tmp3, TEMP tmp2, TEMP tmp, TEMP scratch);
15495   format %{"vextracti64x4  $tmp,$src\n\t"
15496            "vpmovsxbw      $tmp,$tmp\n\t"
15497            "vpmovsxbw      $tmp2,$src\n\t"
15498            "vpsllw         $tmp,$tmp,$shift\n\t"
15499            "vpsllw         $tmp2,$tmp2,$shift\n\t"
15500            "vmovdqu        $tmp3,[0x00ff00ff0x00ff00ff]\n\t"
15501            "vpbroadcastd   $tmp3,$tmp3\n\t"
15502            "vpand          $tmp,$tmp,$tmp3\n\t"
15503            "vpand          $tmp2,$tmp2,$tmp3\n\t"
15504            "vpackuswb      $dst,$tmp,$tmp2\n\t"
15505            "evmovdquq     $tmp3, [0x06040200070500301]\n\t"
15506            "vpermq  $dst,$tmp3,$dst\n\t! left shift for packed64B" %}
15507   ins_encode %{
15508     int vector_len = 2;
15509     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, 1);
15510     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
15511     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
15512     __ vpsllw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15513     __ vpsllw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
15514     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15515     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
15516     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
15517     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
15518     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
15519     __ evmovdquq($tmp3$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
15520     __ vpermq($dst$$XMMRegister, $tmp3$$XMMRegister, $dst$$XMMRegister, vector_len);
15521   %}
15522   ins_pipe( pipe_slow );
15523 %}
15524 
15525 // Shorts/Chars vector left shift
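// Word-lane shifts come in four flavors: an SSE in-place form with an xmm
// count, an SSE in-place form with an 8-bit immediate folded out of the
// LShiftCntV node, and the corresponding non-destructive AVX forms.  The
// SSE pairs exist up to 8S; the wider 16S widths are AVX-only.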
15526 instruct vsll2S(vecS dst, vecS shift) %{
15527   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15528   match(Set dst (LShiftVS dst shift));
15529   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
15530   ins_encode %{
15531     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
15532   %}
15533   ins_pipe( pipe_slow );
15534 %}
15535 
15536 instruct vsll2S_imm(vecS dst, immI8 shift) %{
15537   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15538   match(Set dst (LShiftVS dst (LShiftCntV shift)));
15539   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
15540   ins_encode %{
15541     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
15542   %}
15543   ins_pipe( pipe_slow );
15544 %}
15545 
15546 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
15547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15548   match(Set dst (LShiftVS src shift));
15549   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15550   ins_encode %{
15551     int vector_len = 0;
15552     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15553   %}
15554   ins_pipe( pipe_slow );
15555 %}
15556 
15557 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
15558   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15559   match(Set dst (LShiftVS src (LShiftCntV shift)));
15560   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
15561   ins_encode %{
15562     int vector_len = 0;
15563     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15564   %}
15565   ins_pipe( pipe_slow );
15566 %}
15567 
15568 instruct vsll4S(vecD dst, vecS shift) %{
15569   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
15570   match(Set dst (LShiftVS dst shift));
15571   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
15572   ins_encode %{
15573     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
15574   %}
15575   ins_pipe( pipe_slow );
15576 %}
15577 
15578 instruct vsll4S_imm(vecD dst, immI8 shift) %{
15579   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
15580   match(Set dst (LShiftVS dst (LShiftCntV shift)));
15581   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
15582   ins_encode %{
15583     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
15584   %}
15585   ins_pipe( pipe_slow );
15586 %}
15587 
15588 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
15589   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15590   match(Set dst (LShiftVS src shift));
15591   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15592   ins_encode %{
15593     int vector_len = 0;
15594     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15595   %}
15596   ins_pipe( pipe_slow );
15597 %}
15598 
15599 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
15600   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15601   match(Set dst (LShiftVS src (LShiftCntV shift)));
15602   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
15603   ins_encode %{
15604     int vector_len = 0;
15605     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15606   %}
15607   ins_pipe( pipe_slow );
15608 %}
15609 
15610 instruct vsll8S(vecX dst, vecS shift) %{
15611   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
15612   match(Set dst (LShiftVS dst shift));
15613   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
15614   ins_encode %{
15615     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
15616   %}
15617   ins_pipe( pipe_slow );
15618 %}
15619 
15620 instruct vsll8S_imm(vecX dst, immI8 shift) %{
15621   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
15622   match(Set dst (LShiftVS dst (LShiftCntV shift)));
15623   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
15624   ins_encode %{
15625     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
15626   %}
15627   ins_pipe( pipe_slow );
15628 %}
15629 
15630 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
15631   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15632   match(Set dst (LShiftVS src shift));
15633   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
15634   ins_encode %{
15635     int vector_len = 0;
15636     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15637   %}
15638   ins_pipe( pipe_slow );
15639 %}
15640 
15641 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
15642   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
15643   match(Set dst (LShiftVS src (LShiftCntV shift)));
15644   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
15645   ins_encode %{
15646     int vector_len = 0;
15647     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15648   %}
15649   ins_pipe( pipe_slow );
15650 %}
15651 
15652 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
15653   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
15654   match(Set dst (LShiftVS src shift));
15655   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
15656   ins_encode %{
15657     int vector_len = 1;
15658     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15659   %}
15660   ins_pipe( pipe_slow );
15661 %}
15662 
15663 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
15664   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
15665   match(Set dst (LShiftVS src (LShiftCntV shift)));
15666   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
15667   ins_encode %{
15668     int vector_len = 1;
15669     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15670   %}
15671   ins_pipe( pipe_slow );
15672 %}
15673 
15674 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
15675   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
15676   match(Set dst (LShiftVS src shift));
15677   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
15678   ins_encode %{
15679     int vector_len = 2;
15680     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15681   %}
15682   ins_pipe( pipe_slow );
15683 %}
15684 
15685 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
15686   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
15687   match(Set dst (LShiftVS src (LShiftCntV shift)));
15688   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
15689   ins_encode %{
15690     int vector_len = 2;
15691     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15692   %}
15693   ins_pipe( pipe_slow );
15694 %}
15695 
15696 // Integers vector left shift
15697 instruct vsll2I(vecD dst, vecS shift) %{
15698   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15699   match(Set dst (LShiftVI dst shift));
15700   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
15701   ins_encode %{
15702     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
15703   %}
15704   ins_pipe( pipe_slow );
15705 %}
15706 
15707 instruct vsll2I_imm(vecD dst, immI8 shift) %{
15708   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15709   match(Set dst (LShiftVI dst (LShiftCntV shift)));
15710   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
15711   ins_encode %{
15712     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
15713   %}
15714   ins_pipe( pipe_slow );
15715 %}
15716 
15717 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
15718   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15719   match(Set dst (LShiftVI src shift));
15720   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
15721   ins_encode %{
15722     int vector_len = 0;
15723     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15724   %}
15725   ins_pipe( pipe_slow );
15726 %}
15727 
15728 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
15729   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15730   match(Set dst (LShiftVI src (LShiftCntV shift)));
15731   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
15732   ins_encode %{
15733     int vector_len = 0;
15734     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15735   %}
15736   ins_pipe( pipe_slow );
15737 %}
15738 
15739 instruct vsll4I(vecX dst, vecS shift) %{
15740   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
15741   match(Set dst (LShiftVI dst shift));
15742   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
15743   ins_encode %{
15744     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
15745   %}
15746   ins_pipe( pipe_slow );
15747 %}
15748 
15749 instruct vsll4I_imm(vecX dst, immI8 shift) %{
15750   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
15751   match(Set dst (LShiftVI dst (LShiftCntV shift)));
15752   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
15753   ins_encode %{
15754     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
15755   %}
15756   ins_pipe( pipe_slow );
15757 %}
15758 
15759 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
15760   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15761   match(Set dst (LShiftVI src shift));
15762   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
15763   ins_encode %{
15764     int vector_len = 0;
15765     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15766   %}
15767   ins_pipe( pipe_slow );
15768 %}
15769 
15770 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
15771   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
15772   match(Set dst (LShiftVI src (LShiftCntV shift)));
15773   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
15774   ins_encode %{
15775     int vector_len = 0;
15776     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15777   %}
15778   ins_pipe( pipe_slow );
15779 %}
15780 
15781 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
15782   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
15783   match(Set dst (LShiftVI src shift));
15784   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
15785   ins_encode %{
15786     int vector_len = 1;
15787     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15788   %}
15789   ins_pipe( pipe_slow );
15790 %}
15791 
15792 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
15793   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
15794   match(Set dst (LShiftVI src (LShiftCntV shift)));
15795   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
15796   ins_encode %{
15797     int vector_len = 1;
15798     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15799   %}
15800   ins_pipe( pipe_slow );
15801 %}
15802 
15803 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
15804   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15805   match(Set dst (LShiftVI src shift));
15806   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
15807   ins_encode %{
15808     int vector_len = 2;
15809     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15810   %}
15811   ins_pipe( pipe_slow );
15812 %}
15813 
15814 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
15815   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
15816   match(Set dst (LShiftVI src (LShiftCntV shift)));
15817   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
15818   ins_encode %{
15819     int vector_len = 2;
15820     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15821   %}
15822   ins_pipe( pipe_slow );
15823 %}
15824 
15825 // Longs vector left shift
15826 instruct vsll2L(vecX dst, vecS shift) %{
15827   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15828   match(Set dst (LShiftVL dst shift));
15829   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
15830   ins_encode %{
15831     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
15832   %}
15833   ins_pipe( pipe_slow );
15834 %}
15835 
15836 instruct vsll2L_imm(vecX dst, immI8 shift) %{
15837   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
15838   match(Set dst (LShiftVL dst (LShiftCntV shift)));
15839   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
15840   ins_encode %{
15841     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
15842   %}
15843   ins_pipe( pipe_slow );
15844 %}
15845 
15846 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
15847   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15848   match(Set dst (LShiftVL src shift));
15849   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
15850   ins_encode %{
15851     int vector_len = 0;
15852     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15853   %}
15854   ins_pipe( pipe_slow );
15855 %}
15856 
15857 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
15858   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
15859   match(Set dst (LShiftVL src (LShiftCntV shift)));
15860   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
15861   ins_encode %{
15862     int vector_len = 0;
15863     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15864   %}
15865   ins_pipe( pipe_slow );
15866 %}
15867 
15868 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
15869   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
15870   match(Set dst (LShiftVL src shift));
15871   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
15872   ins_encode %{
15873     int vector_len = 1;
15874     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15875   %}
15876   ins_pipe( pipe_slow );
15877 %}
15878 
15879 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
15880   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
15881   match(Set dst (LShiftVL src (LShiftCntV shift)));
15882   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
15883   ins_encode %{
15884     int vector_len = 1;
15885     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15886   %}
15887   ins_pipe( pipe_slow );
15888 %}
15889 
15890 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
15891   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15892   match(Set dst (LShiftVL src shift));
15893   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
15894   ins_encode %{
15895     int vector_len = 2;
15896     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
15897   %}
15898   ins_pipe( pipe_slow );
15899 %}
15900 
15901 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
15902   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
15903   match(Set dst (LShiftVL src (LShiftCntV shift)));
15904   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
15905   ins_encode %{
15906     int vector_len = 2;
15907     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
15908   %}
15909   ins_pipe( pipe_slow );
15910 %}
15911 
15912 // ----------------------- LogicalRightShift -----------------------------------
15913 
15914 // Bytes vector logical right shift
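      // There is no x86 instruction that shifts packed bytes directly, so the
      // patterns below zero-extend the bytes to words (pmovzxbw), shift the
      // words with psrlw, mask each word back to its low byte with
      // vector_short_to_byte_mask (0x00ff per word), and repack with packuswb.
      // The 256-bit and 512-bit pack instructions operate within 128-bit lanes,
      // which is why the wider variants finish with a vpermq or a
      // vector_byte_perm_mask permutation to restore element order.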
15915 instruct vsrl4B_reg(vecS dst, vecS src, vecS shift, vecD tmp, vecD tmp2) %{
15916   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
15917   match(Set dst (URShiftVB src shift));
15918   effect(TEMP tmp2, TEMP tmp);
15919   format %{"pmovzxbw   $tmp,$src\n\t"
15920            "psrlw      $tmp,$shift\n\t"
15921            "movdqu     $tmp2,[0x00ff00ff00ff00ff]\n\t"
15922            "pand       $tmp,$tmp2\n\t"
15923            "packuswb   $tmp,$tmp\n\t"
15924            "movss      $dst,$tmp\n\t! logical right shift for packed4B" %}
15925   ins_encode %{
15926     __ pmovzxbw($tmp$$XMMRegister, $src$$XMMRegister);
15927     __ psrlw($tmp$$XMMRegister, $shift$$XMMRegister);
15928     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15929     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
15930     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
15931     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
15932   %}
15933   ins_pipe( pipe_slow );
15934 %}
15935 
15936 instruct vsrl8B_reg(vecD dst, vecD src, vecS shift, vecX tmp, vecX tmp2) %{
15937   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
15938   match(Set dst (URShiftVB src shift));
15939   effect(TEMP tmp2, TEMP tmp);
15940   format %{"pmovzxbw   $tmp,$src\n\t"
15941            "psrlw      $tmp,$shift\n\t"
15942            "movdqu     $tmp2,[0x00ff00ff00ff00ff]\n\t"
15943            "pand       $tmp,$tmp2\n\t"
15944            "packuswb   $tmp,$tmp\n\t"
15945            "movsd      $dst,$tmp\n\t! logical right shift for packed8B" %}
15946   ins_encode %{
15947     __ pmovzxbw($tmp$$XMMRegister, $src$$XMMRegister);
15948     __ psrlw($tmp$$XMMRegister, $shift$$XMMRegister);
15949     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15950     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
15951     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
15952     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
15953   %}
15954   ins_pipe( pipe_slow );
15955 %}
15956 
15957 instruct vsrl16B_reg(vecX dst, vecX src, vecS shift, vecX tmp, vecX tmp2, vecX tmp3) %{
15958   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
15959   match(Set dst (URShiftVB src shift));
15960   effect(TEMP tmp2, TEMP tmp, TEMP tmp3);
15961   format %{"pmovzxbw  $tmp,$src\n\t"
15962            "psrlw     $tmp,$shift\n\t"
15963            "pshufd    $tmp2,$src,14\n\t"
15964            "pmovzxbw  $tmp2,$tmp2\n\t"
15965            "psrlw     $tmp2,$shift\n\t"
15966            "movdqu    $tmp3,[0x00ff00ff00ff00ff]\n\t"
15967            "pand      $tmp,$tmp3\n\t"
15968            "pand      $tmp2,$tmp3\n\t"
15969            "packuswb  $tmp,$tmp2\n\t"
15970            "movdqu    $dst,$tmp\n\t! logical right shift for packed16B" %}
15971   ins_encode %{
15972     __ pmovzxbw($tmp$$XMMRegister, $src$$XMMRegister);
15973     __ psrlw($tmp$$XMMRegister, $shift$$XMMRegister);
15974     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 14);
15975     __ pmovzxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
15976     __ psrlw($tmp2$$XMMRegister, $shift$$XMMRegister);
15977     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
15978     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
15979     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
15980     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
15981     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
15982   %}
15983   ins_pipe( pipe_slow );
15984 %}
15985 
15986 instruct vsrl16B_avx(vecX dst, vecX src, vecS shift, vecY tmp, rRegL scratch) %{
15987   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
15988   match(Set dst (URShiftVB src shift));
15989   effect(TEMP dst, TEMP tmp, TEMP scratch);
15990   format %{"vpmovzxbw   $tmp,$src\n\t"
15991            "vpsrlw      $tmp,$tmp,$shift\n\t"
15992            "vpand       $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
15993            "vextracti128_high   $dst,$tmp\n\t"
15994            "vpackuswb   $dst,$tmp,$dst\n\t! logical right shift for packed16B" %}
15995   ins_encode %{
15996     int vector_len = 1;
15997     __ vpmovzxbw($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
15998     __ vpsrlw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
15999     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16000     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
16001     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
16002   %}
16003   ins_pipe( pipe_slow );
16004 %}
16005 
16006 instruct vsrl32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, vecY tmp2, rRegL scratch) %{
16007   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
16008   match(Set dst (URShiftVB src shift));
16009   effect(TEMP tmp2, TEMP tmp, TEMP scratch);
16010   format %{"vextracti128_high  $tmp,$src\n\t"
16011            "vpmovzxbw   $tmp,$tmp\n\t"
16012            "vpmovzxbw   $tmp2,$src\n\t"
16013            "vpsrlw      $tmp,$tmp,$shift\n\t"
16014            "vpsrlw      $tmp2,$tmp2,$shift\n\t"
16015            "vpand       $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
16016            "vpand       $tmp2,$tmp2,[0x00ff00ff00ff00ff]\n\t"
16017            "vpackuswb   $dst,$tmp2,$tmp\n\t"
16018            "vpermq      $dst,$dst, 0xD8\n\t! logical right shift for packed32B" %}
16019   ins_encode %{
16020     int vector_len = 1;
16021     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
16022     __ vpmovzxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
16023     __ vpmovzxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
16024     __ vpsrlw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16025     __ vpsrlw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
16026     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16027     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16028     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
16029     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
16030   %}
16031   ins_pipe( pipe_slow );
16032 %}
16033 
16034 instruct vsrl64B(vecZ dst, vecZ src, vecS shift, vecZ tmp, vecZ tmp2, vecZ tmp3, rRegL scratch) %{
16035   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
16036   match(Set dst (URShiftVB src shift));
16037   effect(TEMP dst, TEMP tmp3, TEMP tmp2, TEMP tmp, TEMP scratch);
16038   format %{"vextracti64x4  $tmp,$src\n\t"
16039            "vpmovzxbw      $tmp,$tmp\n\t"
16040            "vpmovzxbw      $tmp2,$src\n\t"
16041            "vpsrlw         $tmp,$tmp,$shift\n\t"
16042            "vpsrlw         $tmp2,$tmp2,$shift\n\t"
16043            "vmovdqu        $tmp3,[0x00ff00ff00ff00ff]\n\t"
16044            "vpbroadcastd   $tmp3,$tmp3\n\t"
16045            "vpand          $tmp,$tmp,$tmp3\n\t"
16046            "vpand          $tmp2,$tmp2,$tmp3\n\t"
16047            "vpackuswb      $dst,$tmp,$tmp2\n\t"
16048            "evmovdquq      $tmp3,[0x0604020007050301]\n\t"
16049            "vpermq         $dst,$tmp3,$dst\n\t! logical right shift for packed64B" %}
16050   ins_encode %{
16051     int vector_len = 2;
16052     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, 1);
16053     __ vpmovzxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
16054     __ vpmovzxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
16055     __ vpsrlw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16056     __ vpsrlw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
16057     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16058     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16059     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16060     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16061     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
16062     __ evmovdquq($tmp3$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
16063     __ vpermq($dst$$XMMRegister, $tmp3$$XMMRegister, $dst$$XMMRegister, vector_len);
16064   %}
16065   ins_pipe( pipe_slow );
16066 %}
16067 
16068 // Shorts vector logical right shift produces an incorrect Java result
16069 // for negative data because Java promotes short values to int with
16070 // sign extension before shifting. But char vectors are fine since chars
16071 // are unsigned values.
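      // For example (illustrative values, assuming the usual
      // a[i] = (short)(a[i] >>> n) loop shape): with s = (short)0xFF00 and
      // n = 4, Java widens s to the int 0xFFFFFF00 and stores
      // (short)(0xFFFFFF00 >>> 4) = (short)0xFFF0, whereas a 16-bit psrlw on
      // 0xFF00 yields 0x0FF0. Chars widen with zero extension, so the word
      // shift matches the Java result.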
16072 
16073 instruct vsrl2S(vecS dst, vecS shift) %{
16074   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16075   match(Set dst (URShiftVS dst shift));
16076   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
16077   ins_encode %{
16078     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
16079   %}
16080   ins_pipe( pipe_slow );
16081 %}
16082 
16083 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
16084   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16085   match(Set dst (URShiftVS dst (RShiftCntV shift)));
16086   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
16087   ins_encode %{
16088     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
16089   %}
16090   ins_pipe( pipe_slow );
16091 %}
16092 
16093 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
16094   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16095   match(Set dst (URShiftVS src shift));
16096   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
16097   ins_encode %{
16098     int vector_len = 0;
16099     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16100   %}
16101   ins_pipe( pipe_slow );
16102 %}
16103 
16104 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
16105   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16106   match(Set dst (URShiftVS src (RShiftCntV shift)));
16107   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
16108   ins_encode %{
16109     int vector_len = 0;
16110     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16111   %}
16112   ins_pipe( pipe_slow );
16113 %}
16114 
16115 instruct vsrl4S(vecD dst, vecS shift) %{
16116   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16117   match(Set dst (URShiftVS dst shift));
16118   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
16119   ins_encode %{
16120     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
16121   %}
16122   ins_pipe( pipe_slow );
16123 %}
16124 
16125 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
16126   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16127   match(Set dst (URShiftVS dst (RShiftCntV shift)));
16128   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
16129   ins_encode %{
16130     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
16131   %}
16132   ins_pipe( pipe_slow );
16133 %}
16134 
16135 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
16136   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16137   match(Set dst (URShiftVS src shift));
16138   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
16139   ins_encode %{
16140     int vector_len = 0;
16141     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16142   %}
16143   ins_pipe( pipe_slow );
16144 %}
16145 
16146 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
16147   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16148   match(Set dst (URShiftVS src (RShiftCntV shift)));
16149   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
16150   ins_encode %{
16151     int vector_len = 0;
16152     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16153   %}
16154   ins_pipe( pipe_slow );
16155 %}
16156 
16157 instruct vsrl8S(vecX dst, vecS shift) %{
16158   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
16159   match(Set dst (URShiftVS dst shift));
16160   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
16161   ins_encode %{
16162     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
16163   %}
16164   ins_pipe( pipe_slow );
16165 %}
16166 
16167 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
16168   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
16169   match(Set dst (URShiftVS dst (RShiftCntV shift)));
16170   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
16171   ins_encode %{
16172     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
16173   %}
16174   ins_pipe( pipe_slow );
16175 %}
16176 
16177 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
16178   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
16179   match(Set dst (URShiftVS src shift));
16180   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
16181   ins_encode %{
16182     int vector_len = 0;
16183     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16184   %}
16185   ins_pipe( pipe_slow );
16186 %}
16187 
16188 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
16189   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
16190   match(Set dst (URShiftVS src (RShiftCntV shift)));
16191   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
16192   ins_encode %{
16193     int vector_len = 0;
16194     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16195   %}
16196   ins_pipe( pipe_slow );
16197 %}
16198 
16199 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
16200   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16201   match(Set dst (URShiftVS src shift));
16202   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
16203   ins_encode %{
16204     int vector_len = 1;
16205     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16206   %}
16207   ins_pipe( pipe_slow );
16208 %}
16209 
16210 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
16211   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16212   match(Set dst (URShiftVS src (RShiftCntV shift)));
16213   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
16214   ins_encode %{
16215     int vector_len = 1;
16216     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16217   %}
16218   ins_pipe( pipe_slow );
16219 %}
16220 
16221 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
16222   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
16223   match(Set dst (URShiftVS src shift));
16224   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
16225   ins_encode %{
16226     int vector_len = 2;
16227     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16228   %}
16229   ins_pipe( pipe_slow );
16230 %}
16231 
16232 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16233   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
16234   match(Set dst (URShiftVS src (RShiftCntV shift)));
16235   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
16236   ins_encode %{
16237     int vector_len = 2;
16238     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16239   %}
16240   ins_pipe( pipe_slow );
16241 %}
16242 
16243 // Integers vector logical right shift
16244 instruct vsrl2I(vecD dst, vecS shift) %{
16245   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16246   match(Set dst (URShiftVI dst shift));
16247   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
16248   ins_encode %{
16249     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
16250   %}
16251   ins_pipe( pipe_slow );
16252 %}
16253 
16254 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
16255   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16256   match(Set dst (URShiftVI dst (RShiftCntV shift)));
16257   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
16258   ins_encode %{
16259     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
16260   %}
16261   ins_pipe( pipe_slow );
16262 %}
16263 
16264 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
16265   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16266   match(Set dst (URShiftVI src shift));
16267   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
16268   ins_encode %{
16269     int vector_len = 0;
16270     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16271   %}
16272   ins_pipe( pipe_slow );
16273 %}
16274 
16275 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
16276   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16277   match(Set dst (URShiftVI src (RShiftCntV shift)));
16278   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
16279   ins_encode %{
16280     int vector_len = 0;
16281     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16282   %}
16283   ins_pipe( pipe_slow );
16284 %}
16285 
16286 instruct vsrl4I(vecX dst, vecS shift) %{
16287   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16288   match(Set dst (URShiftVI dst shift));
16289   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
16290   ins_encode %{
16291     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
16292   %}
16293   ins_pipe( pipe_slow );
16294 %}
16295 
16296 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
16297   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16298   match(Set dst (URShiftVI dst (RShiftCntV shift)));
16299   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
16300   ins_encode %{
16301     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
16302   %}
16303   ins_pipe( pipe_slow );
16304 %}
16305 
16306 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
16307   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16308   match(Set dst (URShiftVI src shift));
16309   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
16310   ins_encode %{
16311     int vector_len = 0;
16312     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16313   %}
16314   ins_pipe( pipe_slow );
16315 %}
16316 
16317 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
16318   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16319   match(Set dst (URShiftVI src (RShiftCntV shift)));
16320   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
16321   ins_encode %{
16322     int vector_len = 0;
16323     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16324   %}
16325   ins_pipe( pipe_slow );
16326 %}
16327 
16328 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
16329   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
16330   match(Set dst (URShiftVI src shift));
16331   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
16332   ins_encode %{
16333     int vector_len = 1;
16334     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16335   %}
16336   ins_pipe( pipe_slow );
16337 %}
16338 
16339 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
16340   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
16341   match(Set dst (URShiftVI src (RShiftCntV shift)));
16342   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
16343   ins_encode %{
16344     int vector_len = 1;
16345     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16346   %}
16347   ins_pipe( pipe_slow );
16348 %}
16349 
16350 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
16351   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
16352   match(Set dst (URShiftVI src shift));
16353   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
16354   ins_encode %{
16355     int vector_len = 2;
16356     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16357   %}
16358   ins_pipe( pipe_slow );
16359 %}
16360 
16361 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16362   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
16363   match(Set dst (URShiftVI src (RShiftCntV shift)));
16364   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
16365   ins_encode %{
16366     int vector_len = 2;
16367     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16368   %}
16369   ins_pipe( pipe_slow );
16370 %}
16371 
16372 // Longs vector logical right shift
16373 instruct vsrl2L(vecX dst, vecS shift) %{
16374   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16375   match(Set dst (URShiftVL dst shift));
16376   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
16377   ins_encode %{
16378     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
16379   %}
16380   ins_pipe( pipe_slow );
16381 %}
16382 
16383 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
16384   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16385   match(Set dst (URShiftVL dst (RShiftCntV shift)));
16386   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
16387   ins_encode %{
16388     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
16389   %}
16390   ins_pipe( pipe_slow );
16391 %}
16392 
16393 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
16394   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16395   match(Set dst (URShiftVL src shift));
16396   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
16397   ins_encode %{
16398     int vector_len = 0;
16399     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16400   %}
16401   ins_pipe( pipe_slow );
16402 %}
16403 
16404 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
16405   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16406   match(Set dst (URShiftVL src (RShiftCntV shift)));
16407   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
16408   ins_encode %{
16409     int vector_len = 0;
16410     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16411   %}
16412   ins_pipe( pipe_slow );
16413 %}
16414 
16415 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
16416   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
16417   match(Set dst (URShiftVL src shift));
16418   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
16419   ins_encode %{
16420     int vector_len = 1;
16421     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16422   %}
16423   ins_pipe( pipe_slow );
16424 %}
16425 
16426 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
16427   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
16428   match(Set dst (URShiftVL src (RShiftCntV shift)));
16429   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
16430   ins_encode %{
16431     int vector_len = 1;
16432     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16433   %}
16434   ins_pipe( pipe_slow );
16435 %}
16436 
16437 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
16438   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
16439   match(Set dst (URShiftVL src shift));
16440   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
16441   ins_encode %{
16442     int vector_len = 2;
16443     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16444   %}
16445   ins_pipe( pipe_slow );
16446 %}
16447 
16448 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16449   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
16450   match(Set dst (URShiftVL src (RShiftCntV shift)));
16451   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
16452   ins_encode %{
16453     int vector_len = 2;
16454     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16455   %}
16456   ins_pipe( pipe_slow );
16457 %}
16458 
16459 // ------------------- ArithmeticRightShift -----------------------------------
16460 
16461 // Byte vector arithmetic right shift
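      // Same widen-shift-mask-repack scheme as the logical byte shifts above,
      // except the bytes are sign-extended (pmovsxbw) and shifted with psraw
      // so the replicated sign bits survive until the result is narrowed back
      // to bytes.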
16462 instruct vsra4B_reg(vecS dst, vecS src, vecS shift, vecD tmp, vecD tmp2) %{
16463   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
16464   match(Set dst (RShiftVB src shift));
16465   effect(TEMP tmp2, TEMP tmp);
16466   format %{"pmovsxbw  $tmp,$src\n\t"
16467            "psraw     $tmp,$shift\n\t"
16468            "movdqu    $tmp2,[0x00ff00ff00ff00ff]\n\t"
16469            "pand      $tmp,$tmp2\n\t"
16470            "packuswb  $tmp,$tmp\n\t"
16471            "movss     $dst,$tmp\n\t! arithmetic right shift for packed4B" %}
16472   ins_encode %{
16473     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
16474     __ psraw($tmp$$XMMRegister, $shift$$XMMRegister);
16475     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16476     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
16477     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
16478     __ movss($dst$$XMMRegister, $tmp$$XMMRegister);
16479   %}
16480   ins_pipe( pipe_slow );
16481 %}
16482 
16483 instruct vsra8B_reg(vecD dst, vecD src, vecS shift, vecX tmp, vecX tmp2) %{
16484   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
16485   match(Set dst (RShiftVB src shift));
16486   effect(TEMP tmp2, TEMP tmp);
16487   format %{"pmovsxbw  $tmp,$src\n\t"
16488            "psraw     $tmp,$shift\n\t"
16489            "movdqu    $tmp2,[0x00ff00ff00ff00ff]\n\t"
16490            "pand      $tmp,$tmp2\n\t"
16491            "packuswb  $tmp,$tmp\n\t"
16492            "movsd     $dst,$tmp\n\t! arithmetic right shift for packed8B" %}
16493   ins_encode %{
16494     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
16495     __ psraw($tmp$$XMMRegister, $shift$$XMMRegister);
16496     __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16497     __ pand($tmp$$XMMRegister, $tmp2$$XMMRegister);
16498     __ packuswb($tmp$$XMMRegister, $tmp$$XMMRegister);
16499     __ movsd($dst$$XMMRegister, $tmp$$XMMRegister);
16500   %}
16501   ins_pipe( pipe_slow );
16502 %}
16503 
16504 instruct vsra16B_reg(vecX dst, vecX src, vecS shift, vecX tmp, vecX tmp2, vecX tmp3) %{
16505   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
16506   match(Set dst (RShiftVB src shift));
16507   effect(TEMP tmp2, TEMP tmp, TEMP tmp3);
16508   format %{"pmovsxbw  $tmp,$src\n\t"
16509            "psraw     $tmp,$shift\n\t"
16510            "pshufd    $tmp2,$src,0xE\n\t"
16511            "pmovsxbw  $tmp2,$tmp2\n\t"
16512            "psraw     $tmp2,$shift\n\t"
16513            "movdqu    $tmp3,[0x00ff00ff00ff00ff]\n\t"
16514            "pand      $tmp,$tmp3\n\t"
16515            "pand      $tmp2,$tmp3\n\t"
16516            "packuswb  $tmp,$tmp2\n\t"
16517            "movdqu    $dst,$tmp\n\t! arithmetic right shift for packed16B" %}
16518   ins_encode %{
16519     __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
16520     __ psraw($tmp$$XMMRegister, $shift$$XMMRegister);
16521     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
16522     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
16523     __ psraw($tmp2$$XMMRegister, $shift$$XMMRegister);
16524     __ movdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16525     __ pand($tmp$$XMMRegister, $tmp3$$XMMRegister);
16526     __ pand($tmp2$$XMMRegister, $tmp3$$XMMRegister);
16527     __ packuswb($tmp$$XMMRegister, $tmp2$$XMMRegister);
16528     __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
16529   %}
16530   ins_pipe( pipe_slow );
16531 %}
16532 
16533 instruct vsra16B_avx(vecX dst, vecX src, vecS shift, vecY tmp, rRegL scratch) %{
16534   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16535   match(Set dst (RShiftVB src shift));
16536   effect(TEMP dst, TEMP tmp, TEMP scratch);
16537   format %{"vpmovsxbw  $tmp,$src\n\t"
16538            "vpsraw     $tmp,$tmp,$shift\n\t"
16539            "vpand      $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
16540            "vextracti128_high  $dst,$tmp\n\t"
16541            "vpackuswb  $dst,$tmp,$dst\n\t! arithmetic right shift for packed16B" %}
16542   ins_encode %{
16543     int vector_len = 1;
16544     __ vpmovsxbw($tmp$$XMMRegister, $src$$XMMRegister, vector_len);
16545     __ vpsraw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16546     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16547     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
16548     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
16549         %}
16550   ins_pipe( pipe_slow );
16551 %}
16552 
16553 instruct vsra32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, vecY tmp2, rRegL scratch) %{
16554   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
16555   match(Set dst (RShiftVB src shift));
16556   effect(TEMP tmp2, TEMP tmp, TEMP dst, TEMP scratch);
16557   format %{"vextracti128_high  $tmp,$src\n\t"
16558            "vpmovsxbw  $tmp,$tmp\n\t"
16559            "vpmovsxbw  $tmp2,$src\n\t"
16560            "vpsraw     $tmp,$tmp,$shift\n\t"
16561            "vpsraw     $tmp2,$tmp2,$shift\n\t"
16562            "vpand      $tmp,$tmp,[0x00ff00ff00ff00ff]\n\t"
16563            "vpand      $tmp2,$tmp2,[0x00ff00ff00ff00ff]\n\t"
16564            "vpackuswb  $dst,$tmp2,$tmp\n\t"
16565            "vpermq     $dst,$dst,0xD8\n\t! arithmetic right shift for packed32B" %}
16566   ins_encode %{
16567     int vector_len = 1;
16568     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
16569     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
16570     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
16571     __ vpsraw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16572     __ vpsraw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
16573     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16574     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
16575     __ vpackuswb($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
16576     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
16577   %}
16578   ins_pipe( pipe_slow );
16579 %}
16580 
16581 instruct vsra64B(vecZ dst, vecZ src, vecS shift, vecZ tmp, vecZ tmp2, vecZ tmp3, rRegL scratch) %{
16582   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
16583   match(Set dst (RShiftVB src shift));
16584   effect(TEMP dst, TEMP tmp3, TEMP tmp2, TEMP tmp, TEMP scratch);
16585   format %{"vextracti64x4  $tmp,$src\n\t"
16586            "vpmovsxbw      $tmp,$tmp\n\t"
16587            "vpmovsxbw      $tmp2,$src\n\t"
16588            "vpsraw         $tmp,$tmp,$shift\n\t"
16589            "vpsraw         $tmp2,$tmp2,$shift\n\t"
16590            "vmovdqu        $tmp3,[0x00ff00ff00ff00ff]\n\t"
16591            "vpbroadcastd   $tmp3,$tmp3\n\t"
16592            "vpand          $tmp,$tmp,$tmp3\n\t"
16593            "vpand          $tmp2,$tmp2,$tmp3\n\t"
16594            "vpackuswb      $dst,$tmp,$tmp2\n\t"
16595            "evmovdquq      $tmp3,[0x0604020007050301]\n\t"
16596            "vpermq         $dst,$tmp3,$dst\n\t! arithmetic right shift for packed64B" %}
16597   ins_encode %{
16598     int vector_len = 2;
16599     __ vextracti64x4($tmp$$XMMRegister, $src$$XMMRegister, 1);
16600     __ vpmovsxbw($tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
16601     __ vpmovsxbw($tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
16602     __ vpsraw($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16603     __ vpsraw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
16604     __ vmovdqu($tmp3$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()));
16605     __ vpbroadcastd($tmp3$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16606     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16607     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, vector_len);
16608     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
16609     __ evmovdquq($tmp3$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
16610     __ vpermq($dst$$XMMRegister, $tmp3$$XMMRegister, $dst$$XMMRegister, vector_len);
16611   %}
16612   ins_pipe( pipe_slow );
16613 %}
16614 
16615 // Shorts/Chars vector arithmetic right shift
16616 instruct vsra2S(vecS dst, vecS shift) %{
16617   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16618   match(Set dst (RShiftVS dst shift));
16619   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
16620   ins_encode %{
16621     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
16622   %}
16623   ins_pipe( pipe_slow );
16624 %}
16625 
16626 instruct vsra2S_imm(vecS dst, immI8 shift) %{
16627   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16628   match(Set dst (RShiftVS dst (RShiftCntV shift)));
16629   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
16630   ins_encode %{
16631     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
16632   %}
16633   ins_pipe( pipe_slow );
16634 %}
16635 
16636 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
16637   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16638   match(Set dst (RShiftVS src shift));
16639   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
16640   ins_encode %{
16641     int vector_len = 0;
16642     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16643   %}
16644   ins_pipe( pipe_slow );
16645 %}
16646 
16647 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
16648   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16649   match(Set dst (RShiftVS src (RShiftCntV shift)));
16650   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
16651   ins_encode %{
16652     int vector_len = 0;
16653     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16654   %}
16655   ins_pipe( pipe_slow );
16656 %}
16657 
16658 instruct vsra4S(vecD dst, vecS shift) %{
16659   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16660   match(Set dst (RShiftVS dst shift));
16661   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
16662   ins_encode %{
16663     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
16664   %}
16665   ins_pipe( pipe_slow );
16666 %}
16667 
16668 instruct vsra4S_imm(vecD dst, immI8 shift) %{
16669   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16670   match(Set dst (RShiftVS dst (RShiftCntV shift)));
16671   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
16672   ins_encode %{
16673     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
16674   %}
16675   ins_pipe( pipe_slow );
16676 %}
16677 
16678 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
16679   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16680   match(Set dst (RShiftVS src shift));
16681   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
16682   ins_encode %{
16683     int vector_len = 0;
16684     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16685   %}
16686   ins_pipe( pipe_slow );
16687 %}
16688 
16689 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
16690   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16691   match(Set dst (RShiftVS src (RShiftCntV shift)));
16692   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
16693   ins_encode %{
16694     int vector_len = 0;
16695     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16696   %}
16697   ins_pipe( pipe_slow );
16698 %}
16699 
16700 instruct vsra8S(vecX dst, vecS shift) %{
16701   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
16702   match(Set dst (RShiftVS dst shift));
16703   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
16704   ins_encode %{
16705     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
16706   %}
16707   ins_pipe( pipe_slow );
16708 %}
16709 
16710 instruct vsra8S_imm(vecX dst, immI8 shift) %{
16711   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
16712   match(Set dst (RShiftVS dst (RShiftCntV shift)));
16713   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
16714   ins_encode %{
16715     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
16716   %}
16717   ins_pipe( pipe_slow );
16718 %}
16719 
16720 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
16721   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
16722   match(Set dst (RShiftVS src shift));
16723   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
16724   ins_encode %{
16725     int vector_len = 0;
16726     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16727   %}
16728   ins_pipe( pipe_slow );
16729 %}
16730 
16731 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
16732   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
16733   match(Set dst (RShiftVS src (RShiftCntV shift)));
16734   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
16735   ins_encode %{
16736     int vector_len = 0;
16737     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16738   %}
16739   ins_pipe( pipe_slow );
16740 %}
16741 
16742 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
16743   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16744   match(Set dst (RShiftVS src shift));
16745   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
16746   ins_encode %{
16747     int vector_len = 1;
16748     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16749   %}
16750   ins_pipe( pipe_slow );
16751 %}
16752 
16753 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
16754   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
16755   match(Set dst (RShiftVS src (RShiftCntV shift)));
16756   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
16757   ins_encode %{
16758     int vector_len = 1;
16759     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16760   %}
16761   ins_pipe( pipe_slow );
16762 %}
16763 
16764 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
16765   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
16766   match(Set dst (RShiftVS src shift));
16767   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
16768   ins_encode %{
16769     int vector_len = 2;
16770     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16771   %}
16772   ins_pipe( pipe_slow );
16773 %}
16774 
16775 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16776   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
16777   match(Set dst (RShiftVS src (RShiftCntV shift)));
16778   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
16779   ins_encode %{
16780     int vector_len = 2;
16781     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16782   %}
16783   ins_pipe( pipe_slow );
16784 %}
16785 
16786 // Integers vector arithmetic right shift
16787 instruct vsra2I(vecD dst, vecS shift) %{
16788   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16789   match(Set dst (RShiftVI dst shift));
16790   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
16791   ins_encode %{
16792     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
16793   %}
16794   ins_pipe( pipe_slow );
16795 %}
16796 
16797 instruct vsra2I_imm(vecD dst, immI8 shift) %{
16798   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
16799   match(Set dst (RShiftVI dst (RShiftCntV shift)));
16800   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
16801   ins_encode %{
16802     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
16803   %}
16804   ins_pipe( pipe_slow );
16805 %}
16806 
16807 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
16808   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16809   match(Set dst (RShiftVI src shift));
16810   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
16811   ins_encode %{
16812     int vector_len = 0;
16813     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16814   %}
16815   ins_pipe( pipe_slow );
16816 %}
16817 
16818 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
16819   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
16820   match(Set dst (RShiftVI src (RShiftCntV shift)));
16821   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
16822   ins_encode %{
16823     int vector_len = 0;
16824     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16825   %}
16826   ins_pipe( pipe_slow );
16827 %}
16828 
16829 instruct vsra4I(vecX dst, vecS shift) %{
16830   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16831   match(Set dst (RShiftVI dst shift));
16832   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
16833   ins_encode %{
16834     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
16835   %}
16836   ins_pipe( pipe_slow );
16837 %}
16838 
16839 instruct vsra4I_imm(vecX dst, immI8 shift) %{
16840   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
16841   match(Set dst (RShiftVI dst (RShiftCntV shift)));
16842   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
16843   ins_encode %{
16844     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
16845   %}
16846   ins_pipe( pipe_slow );
16847 %}
16848 
16849 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
16850   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16851   match(Set dst (RShiftVI src shift));
16852   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
16853   ins_encode %{
16854     int vector_len = 0;
16855     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16856   %}
16857   ins_pipe( pipe_slow );
16858 %}
16859 
16860 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
16861   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
16862   match(Set dst (RShiftVI src (RShiftCntV shift)));
16863   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
16864   ins_encode %{
16865     int vector_len = 0;
16866     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16867   %}
16868   ins_pipe( pipe_slow );
16869 %}
16870 
16871 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
16872   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
16873   match(Set dst (RShiftVI src shift));
16874   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
16875   ins_encode %{
16876     int vector_len = 1;
16877     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16878   %}
16879   ins_pipe( pipe_slow );
16880 %}
16881 
16882 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
16883   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
16884   match(Set dst (RShiftVI src (RShiftCntV shift)));
16885   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
16886   ins_encode %{
16887     int vector_len = 1;
16888     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16889   %}
16890   ins_pipe( pipe_slow );
16891 %}
16892 
16893 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
16894   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
16895   match(Set dst (RShiftVI src shift));
16896   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
16897   ins_encode %{
16898     int vector_len = 2;
16899     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
16900   %}
16901   ins_pipe( pipe_slow );
16902 %}
16903 
16904 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
16905   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
16906   match(Set dst (RShiftVI src (RShiftCntV shift)));
16907   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
16908   ins_encode %{
16909     int vector_len = 2;
16910     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
16911   %}
16912   ins_pipe( pipe_slow );
16913 %}
16914 
16915 // Long vector arithmetic right shift
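// There is no 64-bit arithmetic right shift below AVX-512 (no psraq), so
// it is emulated from the logical shift:
//   sra(x, s) == ((x >>> s) ^ m) - m,  with m == 0x8000000000000000 >>> s,
// where 0x8000000000000000 is the long sign-mask constant loaded below.
// The xor flips the shifted-in zero bits under the shifted sign mask and
// the subtract then propagates the sign. E.g. x == -1, s == 1:
//   (x >>> 1) == 0x7FFF...,  m == 0x4000...,
//   (0x7FFF... ^ m) == 0x3FFF...,  0x3FFF... - m == 0xFFFF... == -1.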
16916 instruct vsra1L(vecD dst, vecD src, vecS shift, vecD tmp) %{
16917   predicate(n->as_Vector()->length() == 1);
16918   match(Set dst (RShiftVL src shift));
16919   effect(TEMP dst, TEMP tmp);
16920   format %{ "movdqu  $dst,$src\n\t"
16921             "psrlq   $dst,$shift\n\t"
16922             "movdqu  $tmp,[0x8000000000000000]\n\t"
16923             "psrlq   $tmp,$shift\n\t"
16924             "pxor    $dst,$tmp\n\t"
16925             "psubq   $dst,$tmp\t! arithmetic right shift packed1L" %}
16926   ins_encode %{
16927     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
16928     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16930     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
16931     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
16932     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
16933   %}
16934   ins_pipe( pipe_slow );
16935 %}
16936 
16937 instruct vsra1L_imm(vecD dst, vecD src, immI8 shift, vecD tmp) %{
16938   predicate(n->as_Vector()->length() == 1);
16939   match(Set dst (RShiftVL src (RShiftCntV shift)));
16940   effect(TEMP dst, TEMP tmp);
16941   format %{ "movdqu  $dst,$src\n\t"
16942             "psrlq   $dst,$shift\n\t"
16943             "movdqu  $tmp,[0x8000000000000000]\n\t"
16944             "psrlq   $tmp,$shift\n\t"
16945             "pxor    $dst,$tmp\n\t"
16946             "psubq   $dst,$tmp\t! arithmetic right shift packed1L" %}
16947   ins_encode %{
16948     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
16949     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16951     __ psrlq($tmp$$XMMRegister, (int)$shift$$constant);
16952     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
16953     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
16954   %}
16955   ins_pipe( pipe_slow );
16956 %}
16957 
16958 instruct vsra1L_reg(vecD dst, vecD src, vecS shift, vecD tmp) %{
16959   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
16960   match(Set dst (RShiftVL src shift));
16961   effect(TEMP dst, TEMP tmp);
16962   format %{ "vpsrlq   $dst,$src,$shift\n\t"
16963             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
16964             "vpsrlq   $tmp,$tmp,$shift\n\t"
16965             "vpxor    $dst,$dst,$tmp\n\t"
16966             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed1L" %}
16967   ins_encode %{
16968     int vector_len = 0;
16969     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16971     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
16972     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16973     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16974   %}
16975   ins_pipe( pipe_slow );
16976 %}
16977 
16978 instruct vsra1L_reg_imm(vecD dst, vecD src, immI8 shift, vecD tmp) %{
16979   predicate(UseAVX > 0 && n->as_Vector()->length() == 1);
16980   match(Set dst (RShiftVL src (RShiftCntV shift)));
16981   effect(TEMP dst, TEMP tmp);
16982   format %{ "vpsrlq   $dst,$src,$shift\n\t"
16983             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
16984             "vpsrlq   $tmp,$tmp,$shift\n\t"
16985             "vpxor    $dst,$dst,$tmp\n\t"
16986             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed1L" %}
16987   ins_encode %{
16988     int vector_len = 0;
16989     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
16991     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
16992     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16993     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
16994   %}
16995   ins_pipe( pipe_slow );
16996 %}
16997 
16998 instruct vsra1L_reg_evex(vecD dst, vecD src, vecS shift) %{
16999   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 1);
17000   match(Set dst (RShiftVL src shift));
17001   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed1L" %}
17002   ins_encode %{
17003     int vector_len = 0;
17004     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17005   %}
17006   ins_pipe( pipe_slow );
17007 %}
17008 
17009 instruct vsra2L_reg_imm(vecX dst, vecX src, immI8 shift, vecX tmp) %{
17010   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
17011   match(Set dst (RShiftVL src (RShiftCntV shift)));
17012   effect(TEMP dst, TEMP tmp);
17013   format %{ "movdqu  $dst,$src\n\t"
17014             "psrlq   $dst,$shift\n\t"
17015             "movdqu  $tmp,[0x8000000000000000]\n\t"
17016             "psrlq   $tmp,$shift\n\t"
17017             "pxor    $dst,$tmp\n\t"
17018             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
17019   ins_encode %{
17020     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
17021     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17023     __ psrlq($tmp$$XMMRegister, (int)$shift$$constant);
17024     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
17025     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
17026   %}
17027   ins_pipe( pipe_slow );
17028 %}
17029 
17030 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp) %{
17031   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
17032   match(Set dst (RShiftVL src shift));
17033   effect(TEMP dst, TEMP tmp);
17034   format %{ "movdqu  $dst,$src\n\t"
17035             "psrlq   $dst,$shift\n\t"
17036             "movdqu  $tmp,[0x8000000000000000]\n\t"
17037             "psrlq   $tmp,$shift\n\t"
17038             "pxor    $dst,$tmp\n\t"
17039             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
17040   ins_encode %{
17041     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
17042     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17044     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
17045     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
17046     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
17047   %}
17048   ins_pipe( pipe_slow );
17049 %}
17050 
17051 instruct vsra2L_reg_evex_imm(vecX dst, vecX src, immI8 shift) %{
17052   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2);
17053   match(Set dst (RShiftVL src (RShiftCntV shift)));
17054   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
17055   ins_encode %{
17056     int vector_len = 0;
17057     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17058   %}
17059   ins_pipe( pipe_slow );
17060 %}
17061 
17062 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
17063   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 2);
17064   match(Set dst (RShiftVL src shift));
17065   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
17066   ins_encode %{
17067     int vector_len = 0;
17068     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17069   %}
17070   ins_pipe( pipe_slow );
17071 %}
17072 
17073 instruct vsra4L_reg_imm(vecY dst, vecY src, immI8 shift, vecY tmp) %{
17074   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
17075   match(Set dst (RShiftVL src (RShiftCntV shift)));
17076   effect(TEMP dst, TEMP tmp);
17077   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17078             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17079             "vpsrlq   $tmp,$tmp,$shift\n\t"
17080             "vpxor    $dst,$dst,$tmp\n\t"
17081             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
17082   ins_encode %{
17083     int vector_len = 1;
17084     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17086     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, (int)$shift$$constant, vector_len);
17087     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17088     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17089   %}
17090   ins_pipe( pipe_slow );
17091 %}
17092 
17093 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp) %{
17094   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
17095   match(Set dst (RShiftVL src shift));
17096   effect(TEMP dst, TEMP tmp);
17097   format %{ "vpsrlq   $dst,$src,$shift\n\t"
17098             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
17099             "vpsrlq   $tmp,$tmp,$shift\n\t"
17100             "vpxor    $dst,$dst,$tmp\n\t"
17101             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
17102   ins_encode %{
17103     int vector_len = 1;
17104     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17106     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17107     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17108     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17109   %}
17110   ins_pipe( pipe_slow );
17111 %}
17112 
17113 instruct vsra4L_reg_evex_imm(vecY dst, vecY src, immI8 shift) %{
17114   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4);
17115   match(Set dst (RShiftVL src (RShiftCntV shift)));
  format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
17117   ins_encode %{
17118     int vector_len = 1;
17119     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17120   %}
17121   ins_pipe( pipe_slow );
17122 %}
17123 
17124 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
17125   predicate(UseAVX > 2 && VM_Version::supports_avx512vl() && n->as_Vector()->length() == 4);
17126   match(Set dst (RShiftVL src shift));
17127   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
17128   ins_encode %{
17129     int vector_len = 1;
17130     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17131   %}
17132   ins_pipe( pipe_slow );
17133 %}
17134 
17135 instruct vsra8L_reg_evex_imm(vecZ dst, vecZ src, immI8 shift) %{
17136   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
17137   match(Set dst (RShiftVL src (RShiftCntV shift)));
  format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed8L" %}
17139   ins_encode %{
17140     int vector_len = 2;
17141     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
17142   %}
17143   ins_pipe( pipe_slow );
17144 %}
17145 
17146 instruct vsra8L_reg_evex(vecZ dst, vecZ src, vecS shift) %{
17147   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
17148   match(Set dst (RShiftVL src shift));
17149   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed8L" %}
17150   ins_encode %{
17151     int vector_len = 2;
17152     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17153   %}
17154   ins_pipe( pipe_slow );
17155 %}
17156 
17157 // ------------------- Variable Bit Shift Left Logical -----------------------------
// Integer variable left shift
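// AVX2 vpsllvd shifts each 32-bit lane left by the count held in the
// corresponding lane of the shift vector. The Op_LShiftCntV test keeps
// these rules from matching broadcast (scalar) counts, which the
// fixed-count shift rules above already handle.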
17159 instruct vsllv2I(vecD dst, vecD src, vecD shift) %{
17160   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17161   match(Set dst (LShiftVI src shift));
17162   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed2I" %}
17163   ins_encode %{
17164     int vector_len = 0;
17165     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17166   %}
17167   ins_pipe( pipe_slow );
17168 %}
17169 
17170 instruct vsllv4I_reg(vecX dst, vecX src, vecX shift) %{
17171   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17172   match(Set dst (LShiftVI src shift));
17173   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed4I" %}
17174   ins_encode %{
17175     int vector_len = 0;
17176     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17177   %}
17178   ins_pipe( pipe_slow );
17179 %}
17180 
17181 instruct vsllv4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17182   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17183   match(Set dst (LShiftVI src shift));
17184   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed4I" %}
17185   ins_encode %{
17186     int vector_len = 0;
17187     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17188   %}
17189   ins_pipe( pipe_slow );
17190 %}
17191 
17192 instruct vsllv8I_reg(vecY dst, vecY src, vecY shift) %{
17193   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17194   match(Set dst (LShiftVI src shift));
17195   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed8I" %}
17196   ins_encode %{
17197     int vector_len = 1;
17198     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17199   %}
17200   ins_pipe( pipe_slow );
17201 %}
17202 
17203 instruct vsllv8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17204   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17205   match(Set dst (LShiftVI src shift));
17206   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed8I" %}
17207   ins_encode %{
17208     int vector_len = 1;
17209     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17210   %}
17211   ins_pipe( pipe_slow );
17212 %}
17213 
17214 instruct vsllv16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17215   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_LShiftCntV);
17216   match(Set dst (LShiftVI src shift));
17217   format %{ "vpsllvd  $dst,$src,$shift\t! variable bit shift left shift packed16I" %}
17218   ins_encode %{
17219     int vector_len = 2;
17220     __ vpsllvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17221   %}
17222   ins_pipe( pipe_slow );
17223 %}
17224 
// Long variable left shift
17226 instruct vsllv1L_reg(vecD dst, vecD src, vecD shift) %{
17227   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_LShiftCntV);
17228   match(Set dst (LShiftVL src shift));
17229   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed1L" %}
17230   ins_encode %{
17231     int vector_len = 0;
17232     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17233   %}
17234   ins_pipe( pipe_slow );
17235 %}
17236 
17237 instruct vsllv2L_reg(vecX dst, vecX src, vecX shift) %{
17238   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17239   match(Set dst (LShiftVL src shift));
17240   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed2L" %}
17241   ins_encode %{
17242     int vector_len = 0;
17243     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17244   %}
17245   ins_pipe( pipe_slow );
17246 %}
17247 
17248 instruct vsllv2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17249   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_LShiftCntV);
17250   match(Set dst (LShiftVL src shift));
17251   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed2L" %}
17252   ins_encode %{
17253     int vector_len = 0;
17254     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17255   %}
17256   ins_pipe( pipe_slow );
17257 %}
17258 
17259 instruct vsllv4L_reg(vecY dst, vecY src, vecY shift) %{
17260   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17261   match(Set dst (LShiftVL src shift));
17262   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed4L" %}
17263   ins_encode %{
17264     int vector_len = 1;
17265     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17266   %}
17267   ins_pipe( pipe_slow );
17268 %}
17269 
17270 instruct vsllv4L_reg_evex(vecY dst, vecY src, vecY shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_LShiftCntV);
17272   match(Set dst (LShiftVL src shift));
17273   format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed4L" %}
17274   ins_encode %{
17275     int vector_len = 1;
17276     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17277   %}
17278   ins_pipe( pipe_slow );
17279 %}
17280 
17281 instruct vsllv8L_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17282   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_LShiftCntV);
17283   match(Set dst (LShiftVL src shift));
  format %{ "vpsllvq  $dst,$src,$shift\t! variable bit shift left shift packed8L" %}
17285   ins_encode %{
17286     int vector_len = 2;
17287     __ vpsllvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17288   %}
17289   ins_pipe( pipe_slow );
17290 %}
17291 
17292 // ------------------- Variable Bit Shift Right Logical -----------------------------
// Integer variable right shift
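// vpsrlvd/vpsrlvq (AVX2) shift each lane right logically by its own
// per-lane count; as above, Op_RShiftCntV excludes broadcast counts.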
17294 instruct vsrlv2I_reg(vecD dst, vecD src, vecD shift) %{
17295   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17296   match(Set dst (URShiftVI src shift));
17297   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed2I" %}
17298   ins_encode %{
17299     int vector_len = 0;
17300     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17301   %}
17302   ins_pipe( pipe_slow );
17303 %}
17304 
17305 instruct vsrlv4I_reg(vecX dst, vecX src, vecX shift) %{
17306   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17307   match(Set dst (URShiftVI src shift));
17308   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17309   ins_encode %{
17310     int vector_len = 0;
17311     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17312   %}
17313   ins_pipe( pipe_slow );
17314 %}
17315 
17316 instruct vsrlv4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17317   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17318   match(Set dst (URShiftVI src shift));
17319   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17320   ins_encode %{
17321     int vector_len = 0;
17322     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17323   %}
17324   ins_pipe( pipe_slow );
17325 %}
17326 
17327 instruct vsrlv8I_reg(vecY dst, vecY src, vecY shift) %{
17328   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17329   match(Set dst (URShiftVI src shift));
17330   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17331   ins_encode %{
17332     int vector_len = 1;
17333     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17334   %}
17335   ins_pipe( pipe_slow );
17336 %}
17337 
17338 instruct vsrlv8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17339   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17340   match(Set dst (URShiftVI src shift));
17341   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17342   ins_encode %{
17343     int vector_len = 1;
17344     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17345   %}
17346   ins_pipe( pipe_slow );
17347 %}
17348 
17349 instruct vsrlv16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17350   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_RShiftCntV);
17351   match(Set dst (URShiftVI src shift));
17352   format %{ "vpsrlvd  $dst,$src,$shift\t! variable bit shift right shift packed16I" %}
17353   ins_encode %{
17354     int vector_len = 2;
17355     __ vpsrlvd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17356   %}
17357   ins_pipe( pipe_slow );
17358 %}
17359 
// Long variable right shift
17361 instruct vsrlv1L_reg(vecD dst, vecD src, vecD shift) %{
17362   predicate(UseAVX > 1 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17363   match(Set dst (URShiftVL src shift));
17364   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed1L" %}
17365   ins_encode %{
17366     int vector_len = 0;
17367     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17368   %}
17369   ins_pipe( pipe_slow );
17370 %}
17371 
17372 instruct vsrlv2L_reg(vecX dst, vecX src, vecX shift) %{
17373   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17374   match(Set dst (URShiftVL src shift));
17375   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed2L" %}
17376   ins_encode %{
17377     int vector_len = 0;
17378     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17379   %}
17380   ins_pipe( pipe_slow );
17381 %}
17382 
17383 instruct vsrlv2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17384   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17385   match(Set dst (URShiftVL src shift));
17386   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed2L" %}
17387   ins_encode %{
17388     int vector_len = 0;
17389     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17390   %}
17391   ins_pipe( pipe_slow );
17392 %}
17393 
17394 instruct vsrlv4L_reg(vecY dst, vecY src, vecY shift) %{
17395   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17396   match(Set dst (URShiftVL src shift));
17397   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed4L" %}
17398   ins_encode %{
17399     int vector_len = 1;
17400     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17401   %}
17402   ins_pipe( pipe_slow );
17403 %}
17404 
17405 instruct vsrlv4L_reg_evex(vecY dst, vecY src, vecY shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17407   match(Set dst (URShiftVL src shift));
17408   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed4L" %}
17409   ins_encode %{
17410     int vector_len = 1;
17411     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17412   %}
17413   ins_pipe( pipe_slow );
17414 %}
17415 
17416 instruct vsrlv8L_reg(vecZ dst, vecZ src, vecZ shift) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17418   match(Set dst (URShiftVL src shift));
17419   format %{ "vpsrlvq  $dst,$src,$shift\t! variable bit shift right shift packed8L" %}
17420   ins_encode %{
17421     int vector_len = 2;
17422     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17423   %}
17424   ins_pipe( pipe_slow );
17425 %}
17426 
17427 // ------------------- Variable Bit Shift Right Arithmetic -----------------------------
// Integer variable right shift
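// vpsravd (AVX2) provides the per-lane arithmetic right shift directly
// for 32-bit lanes.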
17429 instruct vsrav2I_reg(vecD dst, vecD src, vecD shift) %{
17430   predicate(UseAVX > 1 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17431   match(Set dst (RShiftVI src shift));
17432   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed2I" %}
17433   ins_encode %{
17434     int vector_len = 0;
17435     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17436   %}
17437   ins_pipe( pipe_slow );
17438 %}
17439 
17440 instruct vsrav4I_reg(vecX dst, vecX src, vecX shift) %{
17441   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17442   match(Set dst (RShiftVI src shift));
17443   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17444   ins_encode %{
17445     int vector_len = 0;
17446     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17447   %}
17448   ins_pipe( pipe_slow );
17449 %}
17450 
17451 instruct vsrav4I_reg_evex(vecX dst, vecX src, vecX shift) %{
17452   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17453   match(Set dst (RShiftVI src shift));
17454   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed4I" %}
17455   ins_encode %{
17456     int vector_len = 0;
17457     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17458   %}
17459   ins_pipe( pipe_slow );
17460 %}
17461 
17462 instruct vsrav8I_reg(vecY dst, vecY src, vecY shift) %{
17463   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17464   match(Set dst (RShiftVI src shift));
17465   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17466   ins_encode %{
17467     int vector_len = 1;
17468     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17469   %}
17470   ins_pipe( pipe_slow );
17471 %}
17472 
17473 instruct vsrav8I_reg_evex(vecY dst, vecY src, vecY shift) %{
17474   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17475   match(Set dst (RShiftVI src shift));
17476   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed8I" %}
17477   ins_encode %{
17478     int vector_len = 1;
17479     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17480   %}
17481   ins_pipe( pipe_slow );
17482 %}
17483 
17484 instruct vsrav16I_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17485   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->in(2)->Opcode() != Op_RShiftCntV);
17486   match(Set dst (RShiftVI src shift));
17487   format %{ "vpsravd  $dst,$src,$shift\t! variable bit shift right shift packed16I" %}
17488   ins_encode %{
17489     int vector_len = 2;
17490     __ vpsravd($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17491   %}
17492   ins_pipe( pipe_slow );
17493 %}
17494 
// Long variable arithmetic right shift
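// There is no per-lane vpsravq below AVX-512 (evpsravq), so the AVX/AVX2
// rules reuse the xor/subtract sign-extension trick from above, with
// vpsrlvq supplying the per-lane logical shifts of both the source and
// the sign mask.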
17496 instruct vsrav1L_reg(vecD dst, vecD src, vecD shift, vecD tmp) %{
17497   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17498   match(Set dst (RShiftVL src shift));
17499   effect(TEMP dst, TEMP tmp);
17500   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17501             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17502             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17503             "vpxor     $dst,$dst,$tmp\n\t"
17504             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
    __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}
17515 
17516 instruct vsrav1L_reg_evex(vecD dst, vecD src, vecD shift) %{
17517   predicate(UseAVX > 2 && n->as_Vector()->length() == 1 && n->in(2)->Opcode() != Op_RShiftCntV);
17518   match(Set dst (RShiftVL src shift));
17519   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed1L" %}
17520   ins_encode %{
17521     int vector_len = 0;
17522     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17523   %}
17524   ins_pipe( pipe_slow );
17525 %}
17526 
17527 instruct vsrav2L_reg(vecX dst, vecX src, vecX shift, vecX tmp) %{
17528   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17529   match(Set dst (RShiftVL src shift));
17530   effect(TEMP dst, TEMP tmp);
17531   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17532             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17533             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17534             "vpxor     $dst,$dst,$tmp\n\t"
17535             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed2L" %}
17536   ins_encode %{
17537     int vector_len = 0;
17538     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17540     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17541     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17542     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17543   %}
17544   ins_pipe( pipe_slow );
17545 %}
17546 
17547 instruct vsrav2L_reg_evex(vecX dst, vecX src, vecX shift) %{
17548   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && n->in(2)->Opcode() != Op_RShiftCntV);
17549   match(Set dst (RShiftVL src shift));
17550   format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed2L" %}
17551   ins_encode %{
17552     int vector_len = 0;
17553     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17554   %}
17555   ins_pipe( pipe_slow );
17556 %}
17557 
17558 instruct vsrav4L_reg(vecY dst, vecY src, vecY shift, vecY tmp) %{
17559   predicate(UseAVX > 1 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17560   match(Set dst (RShiftVL src shift));
17561   effect(TEMP dst, TEMP tmp);
17562   format %{ "vpsrlvq   $dst,$src,$shift\n\t"
17563             "vmovdqu   $tmp,[0x8000000000000000]\n\t"
17564             "vpsrlvq   $tmp,$tmp,$shift\n\t"
17565             "vpxor     $dst,$dst,$tmp\n\t"
17566             "vpsubq    $dst,$dst,$tmp\t! variable arithmetic right shift packed4L" %}
17567   ins_encode %{
17568     int vector_len = 1;
17569     __ vpsrlvq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()));
17571     __ vpsrlvq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
17572     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17573     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
17574   %}
17575   ins_pipe( pipe_slow );
17576 %}
17577 
17578 instruct vsrav4L_reg_evex(vecY dst, vecY src, vecY shift) %{
17579   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && n->in(2)->Opcode() != Op_RShiftCntV);
17580   match(Set dst (RShiftVL src shift));
  format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed4L" %}
17582   ins_encode %{
17583     int vector_len = 1;
17584     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17585   %}
17586   ins_pipe( pipe_slow );
17587 %}
17588 
17589 instruct vsrav8L_reg_evex(vecZ dst, vecZ src, vecZ shift) %{
17590   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->in(2)->Opcode() != Op_RShiftCntV);
17591   match(Set dst (RShiftVL src shift));
  format %{ "evpsravq  $dst,$src,$shift\t! variable arithmetic right shift packed8L" %}
17593   ins_encode %{
17594     int vector_len = 2;
17595     __ evpsravq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
17596   %}
17597   ins_pipe( pipe_slow );
17598 %}
17599 
17600 // --------------------------------- AND --------------------------------------
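
// Bitwise AND is element-size agnostic, so these rules (and the OR/XOR
// rules below) are selected purely on vector width (length_in_bytes):
// pand for SSE, and the three-operand vpand register and memory forms
// for AVX.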
17601 
17602 instruct vand4B(vecS dst, vecS src) %{
17603   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
17604   match(Set dst (AndV dst src));
17605   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
17606   ins_encode %{
17607     __ pand($dst$$XMMRegister, $src$$XMMRegister);
17608   %}
17609   ins_pipe( pipe_slow );
17610 %}
17611 
17612 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
17613   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17614   match(Set dst (AndV src1 src2));
17615   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
17616   ins_encode %{
17617     int vector_len = 0;
17618     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17619   %}
17620   ins_pipe( pipe_slow );
17621 %}
17622 
17623 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
17624   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17625   match(Set dst (AndV src (LoadVector mem)));
17626   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
17627   ins_encode %{
17628     int vector_len = 0;
17629     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17630   %}
17631   ins_pipe( pipe_slow );
17632 %}
17633 
17634 instruct vand8B(vecD dst, vecD src) %{
17635   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
17636   match(Set dst (AndV dst src));
17637   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
17638   ins_encode %{
17639     __ pand($dst$$XMMRegister, $src$$XMMRegister);
17640   %}
17641   ins_pipe( pipe_slow );
17642 %}
17643 
17644 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
17645   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17646   match(Set dst (AndV src1 src2));
17647   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
17648   ins_encode %{
17649     int vector_len = 0;
17650     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17651   %}
17652   ins_pipe( pipe_slow );
17653 %}
17654 
17655 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
17656   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17657   match(Set dst (AndV src (LoadVector mem)));
17658   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
17659   ins_encode %{
17660     int vector_len = 0;
17661     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17662   %}
17663   ins_pipe( pipe_slow );
17664 %}
17665 
17666 instruct vand16B(vecX dst, vecX src) %{
17667   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
17668   match(Set dst (AndV dst src));
17669   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
17670   ins_encode %{
17671     __ pand($dst$$XMMRegister, $src$$XMMRegister);
17672   %}
17673   ins_pipe( pipe_slow );
17674 %}
17675 
17676 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
17677   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17678   match(Set dst (AndV src1 src2));
17679   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
17680   ins_encode %{
17681     int vector_len = 0;
17682     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17683   %}
17684   ins_pipe( pipe_slow );
17685 %}
17686 
17687 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
17688   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17689   match(Set dst (AndV src (LoadVector mem)));
17690   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
17691   ins_encode %{
17692     int vector_len = 0;
17693     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17694   %}
17695   ins_pipe( pipe_slow );
17696 %}
17697 
17698 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
17699   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17700   match(Set dst (AndV src1 src2));
17701   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
17702   ins_encode %{
17703     int vector_len = 1;
17704     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17705   %}
17706   ins_pipe( pipe_slow );
17707 %}
17708 
17709 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
17710   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17711   match(Set dst (AndV src (LoadVector mem)));
17712   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
17713   ins_encode %{
17714     int vector_len = 1;
17715     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17716   %}
17717   ins_pipe( pipe_slow );
17718 %}
17719 
17720 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
17721   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17722   match(Set dst (AndV src1 src2));
17723   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
17724   ins_encode %{
17725     int vector_len = 2;
17726     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17727   %}
17728   ins_pipe( pipe_slow );
17729 %}
17730 
17731 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
17732   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17733   match(Set dst (AndV src (LoadVector mem)));
17734   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
17735   ins_encode %{
17736     int vector_len = 2;
17737     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17738   %}
17739   ins_pipe( pipe_slow );
17740 %}
17741 
17742 // --------------------------------- OR ---------------------------------------
17743 
17744 instruct vor4B(vecS dst, vecS src) %{
17745   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
17746   match(Set dst (OrV dst src));
17747   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
17748   ins_encode %{
17749     __ por($dst$$XMMRegister, $src$$XMMRegister);
17750   %}
17751   ins_pipe( pipe_slow );
17752 %}
17753 
17754 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
17755   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17756   match(Set dst (OrV src1 src2));
17757   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
17758   ins_encode %{
17759     int vector_len = 0;
17760     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17761   %}
17762   ins_pipe( pipe_slow );
17763 %}
17764 
17765 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
17766   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17767   match(Set dst (OrV src (LoadVector mem)));
17768   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
17769   ins_encode %{
17770     int vector_len = 0;
17771     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17772   %}
17773   ins_pipe( pipe_slow );
17774 %}
17775 
17776 instruct vor8B(vecD dst, vecD src) %{
17777   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
17778   match(Set dst (OrV dst src));
17779   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
17780   ins_encode %{
17781     __ por($dst$$XMMRegister, $src$$XMMRegister);
17782   %}
17783   ins_pipe( pipe_slow );
17784 %}
17785 
17786 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
17787   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17788   match(Set dst (OrV src1 src2));
17789   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
17790   ins_encode %{
17791     int vector_len = 0;
17792     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17793   %}
17794   ins_pipe( pipe_slow );
17795 %}
17796 
17797 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
  predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17799   match(Set dst (OrV src (LoadVector mem)));
17800   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
17801   ins_encode %{
17802     int vector_len = 0;
17803     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17804   %}
17805   ins_pipe( pipe_slow );
17806 %}
17807 
17808 instruct vor16B(vecX dst, vecX src) %{
17809   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
17810   match(Set dst (OrV dst src));
17811   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
17812   ins_encode %{
17813     __ por($dst$$XMMRegister, $src$$XMMRegister);
17814   %}
17815   ins_pipe( pipe_slow );
17816 %}
17817 
17818 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
17819   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17820   match(Set dst (OrV src1 src2));
17821   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
17822   ins_encode %{
17823     int vector_len = 0;
17824     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17825   %}
17826   ins_pipe( pipe_slow );
17827 %}
17828 
17829 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
17830   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17831   match(Set dst (OrV src (LoadVector mem)));
17832   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
17833   ins_encode %{
17834     int vector_len = 0;
17835     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17836   %}
17837   ins_pipe( pipe_slow );
17838 %}
17839 
17840 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
17841   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17842   match(Set dst (OrV src1 src2));
17843   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
17844   ins_encode %{
17845     int vector_len = 1;
17846     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17847   %}
17848   ins_pipe( pipe_slow );
17849 %}
17850 
17851 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
17852   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17853   match(Set dst (OrV src (LoadVector mem)));
17854   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
17855   ins_encode %{
17856     int vector_len = 1;
17857     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17858   %}
17859   ins_pipe( pipe_slow );
17860 %}
17861 
17862 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
17863   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17864   match(Set dst (OrV src1 src2));
17865   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
17866   ins_encode %{
17867     int vector_len = 2;
17868     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17869   %}
17870   ins_pipe( pipe_slow );
17871 %}
17872 
17873 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
17874   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
17875   match(Set dst (OrV src (LoadVector mem)));
17876   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
17877   ins_encode %{
17878     int vector_len = 2;
17879     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17880   %}
17881   ins_pipe( pipe_slow );
17882 %}
17883 
17884 // --------------------------------- XOR --------------------------------------
17885 
17886 instruct vxor4B(vecS dst, vecS src) %{
17887   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
17888   match(Set dst (XorV dst src));
17889   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
17890   ins_encode %{
17891     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
17892   %}
17893   ins_pipe( pipe_slow );
17894 %}
17895 
17896 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
17897   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17898   match(Set dst (XorV src1 src2));
17899   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
17900   ins_encode %{
17901     int vector_len = 0;
17902     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17903   %}
17904   ins_pipe( pipe_slow );
17905 %}
17906 
17907 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
17908   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
17909   match(Set dst (XorV src (LoadVector mem)));
17910   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
17911   ins_encode %{
17912     int vector_len = 0;
17913     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17914   %}
17915   ins_pipe( pipe_slow );
17916 %}
17917 
17918 instruct vxor8B(vecD dst, vecD src) %{
17919   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
17920   match(Set dst (XorV dst src));
17921   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
17922   ins_encode %{
17923     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
17924   %}
17925   ins_pipe( pipe_slow );
17926 %}
17927 
17928 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
17929   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17930   match(Set dst (XorV src1 src2));
17931   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
17932   ins_encode %{
17933     int vector_len = 0;
17934     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17935   %}
17936   ins_pipe( pipe_slow );
17937 %}
17938 
17939 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
17940   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
17941   match(Set dst (XorV src (LoadVector mem)));
17942   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
17943   ins_encode %{
17944     int vector_len = 0;
17945     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17946   %}
17947   ins_pipe( pipe_slow );
17948 %}
17949 
17950 instruct vxor16B(vecX dst, vecX src) %{
17951   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
17952   match(Set dst (XorV dst src));
17953   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
17954   ins_encode %{
17955     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
17956   %}
17957   ins_pipe( pipe_slow );
17958 %}
17959 
17960 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
17961   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17962   match(Set dst (XorV src1 src2));
17963   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
17964   ins_encode %{
17965     int vector_len = 0;
17966     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17967   %}
17968   ins_pipe( pipe_slow );
17969 %}
17970 
17971 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
17972   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
17973   match(Set dst (XorV src (LoadVector mem)));
17974   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
17975   ins_encode %{
17976     int vector_len = 0;
17977     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
17978   %}
17979   ins_pipe( pipe_slow );
17980 %}
17981 
17982 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
17983   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17984   match(Set dst (XorV src1 src2));
17985   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
17986   ins_encode %{
17987     int vector_len = 1;
17988     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
17989   %}
17990   ins_pipe( pipe_slow );
17991 %}
17992 
17993 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
17994   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
17995   match(Set dst (XorV src (LoadVector mem)));
17996   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
17997   ins_encode %{
17998     int vector_len = 1;
17999     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18000   %}
18001   ins_pipe( pipe_slow );
18002 %}
18003 
18004 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
18005   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18006   match(Set dst (XorV src1 src2));
18007   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
18008   ins_encode %{
18009     int vector_len = 2;
18010     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
18011   %}
18012   ins_pipe( pipe_slow );
18013 %}
18014 
18015 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
18016   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
18017   match(Set dst (XorV src (LoadVector mem)));
18018   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
18019   ins_encode %{
18020     int vector_len = 2;
18021     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
18022   %}
18023   ins_pipe( pipe_slow );
18024 %}
18025 
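// --------------------------------- VectorCast --------------------------------

// Widening casts from byte use vpmovsx* sign extensions. Casts to the
// floating point types first widen to int lanes with vpmovsxbd and then
// convert with vcvtdq2ps/vcvtdq2pd; the two steps can use different
// vector_len values when the int form is half the width of the result
// (e.g. 4B to 4D: 128-bit ints, 256-bit doubles).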
18026 instruct vcvt4Bto4S_reg(vecD dst, vecS src) %{
18027   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18028   match(Set dst (VectorCastB2X src));
18029   format %{ "vpmovsxbw   $dst,$src\t! convert 4B to 4S vector" %}
18030   ins_encode %{
18031     int vector_len = 0;
18032     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18033   %}
18034   ins_pipe( pipe_slow );
18035 %}
18036 
18037 instruct vcvt8Bto8S_reg(vecX dst, vecD src) %{
18038   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18039   match(Set dst (VectorCastB2X src));
18040   format %{ "vpmovsxbw   $dst,$src\t! convert 8B to 8S vector" %}
18041   ins_encode %{
18042     int vector_len = 0;
18043     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18044   %}
18045   ins_pipe( pipe_slow );
18046 %}
18047 
18048 instruct vcvt16Bto16S_reg(vecY dst, vecX src) %{
18049   predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18050   match(Set dst (VectorCastB2X src));
18051   format %{ "vpmovsxbw   $dst,$src\t! convert 16B to 16S vector" %}
18052   ins_encode %{
18053     int vector_len = 1;
18054     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18055   %}
18056   ins_pipe( pipe_slow );
18057 %}
18058 
18059 instruct vcvt32Bto32S_reg(vecZ dst, vecY src) %{
18060   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18061   match(Set dst (VectorCastB2X src));
18062   format %{ "vpmovsxbw   $dst,$src\t! convert 32B to 32S vector" %}
18063   ins_encode %{
18064     int vector_len = 2;
18065     __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18066   %}
18067   ins_pipe( pipe_slow );
18068 %}
18069 
18070 instruct vcvt4Bto4I_reg(vecX dst, vecS src) %{
18071   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18072   match(Set dst (VectorCastB2X src));
18073   format %{ "vpmovsxbd   $dst,$src\t! convert 4B to 4I vector" %}
18074   ins_encode %{
18075     int vector_len = 0;
18076     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18077   %}
18078   ins_pipe( pipe_slow );
18079 %}
18080 
18081 instruct vcvt8Bto8I_reg(vecY dst, vecD src) %{
18082   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18083   match(Set dst (VectorCastB2X src));
18084   format %{ "vpmovsxbd   $dst,$src\t! convert 8B to 8I vector" %}
18085   ins_encode %{
18086     int vector_len = 1;
18087     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18088   %}
18089   ins_pipe( pipe_slow );
18090 %}
18091 
18092 instruct vcvt16Bto16I_reg(vecZ dst, vecX src) %{
18093   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18094   match(Set dst (VectorCastB2X src));
18095   format %{ "vpmovsxbd   $dst,$src\t! convert 16B to 16I vector" %}
18096   ins_encode %{
18097     int vector_len = 2;
18098     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18099   %}
18100   ins_pipe( pipe_slow );
18101 %}
18102 
18103 instruct vcvt4Bto4L_reg(vecY dst, vecS src) %{
18104   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18105   match(Set dst (VectorCastB2X src));
18106   format %{ "vpmovsxbq   $dst,$src\t! convert 4B to 4L vector" %}
18107   ins_encode %{
18108     int vector_len = 1;
18109     __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18110   %}
18111   ins_pipe( pipe_slow );
18112 %}
18113 
18114 instruct vcvt8Bto8L_reg(vecZ dst, vecD src) %{
18115   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18116   match(Set dst (VectorCastB2X src));
18117   format %{ "vpmovsxbq   $dst,$src\t! convert 8B to 8L vector" %}
18118   ins_encode %{
18119     int vector_len = 2;
18120     __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18121   %}
18122   ins_pipe( pipe_slow );
18123 %}
18124 
18125 instruct vcvt4Bto4F_reg(vecX dst, vecS src) %{
18126   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18127   match(Set dst (VectorCastB2X src));
18128   format %{ "vpmovsxbd   $dst,$src\n\t"
18129             "vcvtdq2ps   $dst,$dst\t! convert 4B to 4F vector" %}
18130   ins_encode %{
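    // Sign-extend the four bytes to ints, then convert the ints to floats in place.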
18131     int vector_len = 0;
18132     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18133     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18134   %}
18135   ins_pipe( pipe_slow );
18136 %}
18137 
18138 instruct vcvt8Bto8F_reg(vecY dst, vecD src) %{
18139   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18140   match(Set dst (VectorCastB2X src));
18141   format %{ "vpmovsxbd   $dst,$src\n\t"
18142             "vcvtdq2ps   $dst,$dst\t! convert 8B to 8F vector" %}
18143   ins_encode %{
18144     int vector_len = 1;
18145     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18146     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18147   %}
18148   ins_pipe( pipe_slow );
18149 %}
18150 
18151 instruct vcvt16Bto16F_reg(vecZ dst, vecX src) %{
18152   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18153   match(Set dst (VectorCastB2X src));
18154   format %{ "vpmovsxbd   $dst,$src\n\t"
18155             "vcvtdq2ps   $dst,$dst\t! convert 16B to 16F vector" %}
18156   ins_encode %{
18157     int vector_len = 2;
18158     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18159     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18160   %}
18161   ins_pipe( pipe_slow );
18162 %}
18163 
18164 instruct vcvt4Bto4D_reg(vecY dst, vecS src) %{
18165   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18166   match(Set dst (VectorCastB2X src));
18167   format %{ "vpmovsxbd   $dst,$src\n\t"
18168             "vcvtdq2pd   $dst,$dst\t! convert 4B to 4D vector" %}
18169   ins_encode %{
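    // vpmovsxbd widens 4B to 4I within a 128-bit register (vector_len 0); the
    // int-to-double conversion then fills the whole 256-bit register (vector_len 1).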
18170     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, 0);
18171     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, 1);
18172   %}
18173   ins_pipe( pipe_slow );
18174 %}
18175 
18176 instruct vcvt8Bto8D_reg(vecZ dst, vecD src) %{
18177   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18178   match(Set dst (VectorCastB2X src));
18179   format %{ "vpmovsxbd   $dst,$src\n\t"
18180             "vcvtdq2pd   $dst,$dst\t! convert 8B to 8D vector" %}
18181   ins_encode %{
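    // vpmovsxbd widens 8B to 8I within a 256-bit register (vector_len 1); the
    // int-to-double conversion then fills the whole 512-bit register (vector_len 2).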
18182     __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, 1);
18183     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, 2);
18184   %}
18185   ins_pipe( pipe_slow );
18186 %}
18187 
18188 instruct vcvt4Sto4B_reg(vecS dst, vecD src, rRegL scratch) %{
18189   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18190   effect(TEMP scratch);
18191   match(Set dst (VectorCastS2X src));
18192   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18193             "vpackuswb  $dst,$dst\t! convert 4S to 4B vector" %}
18194   ins_encode %{
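    // Clear the high byte of every short so the unsigned saturating pack below
    // passes the low bytes through unchanged, giving a truncating cast.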
18195     int vector_len = 0;
18196     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18197     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18198   %}
18199   ins_pipe( pipe_slow );
18200 %}
18201 
18202 instruct vcvt8Sto8B_reg(vecD dst, vecX src, rRegL scratch) %{
18203   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18204   effect(TEMP scratch);
18205   match(Set dst (VectorCastS2X src));
18206   format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18207             "vpackuswb  $dst,$dst\t! convert 8S to 8B vector" %}
18208   ins_encode %{
18209     int vector_len = 0;
18210     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18211     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18212   %}
18213   ins_pipe( pipe_slow );
18214 %}
18215 
18216 instruct vcvt16Sto16B_reg(vecX dst, vecY src, vecY tmp, rRegL scratch) %{
18217   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18218   effect(TEMP scratch, TEMP tmp);
18219   match(Set dst (VectorCastS2X src));
18220     format %{ "vpand      $dst,$src,[0x00FF00FF00FF00FF]\n\t"
18221               "vextracti128 $tmp,$dst,0x1\n\t"
18222               "vpackuswb  $dst,$dst,$tmp\t! convert 16S to 16B vector" %}
18223   ins_encode %{
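    // Mask to bytes, pull the upper 128-bit lane down into $tmp, then pack
    // both lanes into the low 128 bits of $dst.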
18224     int vector_len = 1;
18225     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
18226     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18227     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18228   %}
18229   ins_pipe( pipe_slow );
18230 %}
18231 
18232 instruct vcvt32Sto32B_reg(vecY dst, vecZ src) %{
18233   predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18234   match(Set dst (VectorCastS2X src));
18235     format %{ "evpmovwb   $dst,$src\t! convert 32S to 32B vector" %}
18236   ins_encode %{
18237     int vector_len = 2;
18238     __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18239   %}
18240   ins_pipe( pipe_slow );
18241 %}
18242 
18243 instruct vcvt2Sto2I_reg(vecD dst, vecS src) %{
18244   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18245   match(Set dst (VectorCastS2X src));
18246   format %{ "vpmovsxwd   $dst,$src\t! convert 2S to 2I vector" %}
18247   ins_encode %{
18248     int vector_len = 0;
18249     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18250   %}
18251   ins_pipe( pipe_slow );
18252 %}
18253 
18254 instruct vcvt4Sto4I_reg(vecX dst, vecD src) %{
18255   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18256   match(Set dst (VectorCastS2X src));
18257   format %{ "vpmovsxwd   $dst,$src\t! convert 4S to 4I vector" %}
18258   ins_encode %{
18259     int vector_len = 0;
18260     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18261   %}
18262   ins_pipe( pipe_slow );
18263 %}
18264 
18265 instruct vcvt8Sto8I_reg(vecY dst, vecX src) %{
18266   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18267   match(Set dst (VectorCastS2X src));
18268   format %{ "vpmovsxwd   $dst,$src\t! convert 8S to 8I vector" %}
18269   ins_encode %{
18270     int vector_len = 1;
18271     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18272   %}
18273   ins_pipe( pipe_slow );
18274 %}
18275 
18276 instruct vcvt16Sto16I_reg(vecZ dst, vecY src) %{
18277   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18278   match(Set dst (VectorCastS2X src));
18279   format %{ "vpmovsxwd   $dst,$src\t! convert 16S to 16I vector" %}
18280   ins_encode %{
18281     int vector_len = 2;
18282     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18283   %}
18284   ins_pipe( pipe_slow );
18285 %}
18286 
18287 instruct vcvt2Sto2L_reg(vecX dst, vecS src) %{
18288   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18289   match(Set dst (VectorCastS2X src));
18290   format %{ "vpmovsxwq   $dst,$src\t! convert 2S to 2L vector" %}
18291   ins_encode %{
18292     int vector_len = 0;
18293     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18294   %}
18295   ins_pipe( pipe_slow );
18296 %}
18297 
18298 instruct vcvt4Sto4L_reg(vecY dst, vecD src) %{
18299   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18300   match(Set dst (VectorCastS2X src));
18301   format %{ "vpmovsxwq   $dst,$src\t! convert 4S to 4L vector" %}
18302   ins_encode %{
18303     int vector_len = 1;
18304     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18305   %}
18306   ins_pipe( pipe_slow );
18307 %}
18308 
18309 instruct vcvt8Sto8L_reg(vecZ dst, vecX src) %{
18310   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18311   match(Set dst (VectorCastS2X src));
18312   format %{ "vpmovsxwq   $dst,$src\t! convert 8S to 8L vector" %}
18313   ins_encode %{
18314     int vector_len = 2;
18315     __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18316   %}
18317   ins_pipe( pipe_slow );
18318 %}
18319 
18320 instruct vcvt2Sto2F_reg(vecD dst, vecS src) %{
18321   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18322   match(Set dst (VectorCastS2X src));
18323   format %{ "vpmovsxwd   $dst,$src\n\t"
18324             "vcvtdq2ps   $dst,$dst\t! convert 2S to 2F vector" %}
18325   ins_encode %{
18326     int vector_len = 0;
18327     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18328     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18329   %}
18330   ins_pipe( pipe_slow );
18331 %}
18332 
18333 instruct vcvt4Sto4F_reg(vecX dst, vecD src) %{
18334   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18335   match(Set dst (VectorCastS2X src));
18336   format %{ "vpmovsxwd   $dst,$src\n\t"
18337             "vcvtdq2ps   $dst,$dst\t! convert 4S to 4F vector" %}
18338   ins_encode %{
18339     int vector_len = 0;
18340     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18341     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18342   %}
18343   ins_pipe( pipe_slow );
18344 %}
18345 
18346 instruct vcvt8Sto8F_reg(vecY dst, vecX src) %{
18347   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18348   match(Set dst (VectorCastS2X src));
18349   format %{ "vpmovsxwd   $dst,$src\n\t"
18350             "vcvtdq2ps   $dst,$dst\t! convert 8S to 8F vector" %}
18351   ins_encode %{
18352     int vector_len = 1;
18353     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18354     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18355   %}
18356   ins_pipe( pipe_slow );
18357 %}
18358 
18359 instruct vcvt16Sto16F_reg(vecZ dst, vecY src) %{
18360   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18361   match(Set dst (VectorCastS2X src));
18362   format %{ "vpmovsxwd   $dst,$src\n\t"
18363             "vcvtdq2ps   $dst,$dst\t! convert 16S to 16F vector" %}
18364   ins_encode %{
18365     int vector_len = 2;
18366     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18367     __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18368   %}
18369   ins_pipe( pipe_slow );
18370 %}
18371 
18372 instruct vcvt2Sto2D_reg(vecX dst, vecS src) %{
18373   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18374   match(Set dst (VectorCastS2X src));
18375   format %{ "vpmovsxwd   $dst,$src\n\t"
18376             "vcvtdq2pd   $dst,$dst\t! convert 2S to 2D vector" %}
18377   ins_encode %{
18378     int vector_len = 0;
18379     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18380     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18381   %}
18382   ins_pipe( pipe_slow );
18383 %}
18384 
18385 instruct vcvt4Sto4D_reg(vecY dst, vecD src) %{
18386   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18387   match(Set dst (VectorCastS2X src));
18388   format %{ "vpmovsxwd   $dst,$src\n\t"
18389             "vcvtdq2pd   $dst,$dst\t! convert 4S to 4D vector" %}
18390   ins_encode %{
18391     int vector_len = 1;
18392     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18393     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18394   %}
18395   ins_pipe( pipe_slow );
18396 %}
18397 
18398 instruct vcvt8Sto8D_reg(vecZ dst, vecX src) %{
18399   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18400   match(Set dst (VectorCastS2X src));
18401   format %{ "vpmovsxwd   $dst,$src\n\t"
18402             "vcvtdq2pd   $dst,$dst\t! convert 8S to 8D vector" %}
18403   ins_encode %{
18404     int vector_len = 2;
18405     __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18406     __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18407   %}
18408   ins_pipe( pipe_slow );
18409 %}
18410 
18411 instruct vcvt4Ito4B_reg(vecS dst, vecX src, rRegL scratch) %{
18412   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18413   effect(TEMP scratch);
18414   match(Set dst (VectorCastI2X src));
18415   format %{ "vpand      $dst,$src,[0x000000FF000000FF]\n\t"
18416             "vpackusdw  $dst,$dst\n\t"
18417             "vpackuswb  $dst,$dst\t! convert 4I to 4B vector" %}
18418   ins_encode %{
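    // Truncate each int to a byte: mask to the low byte, then pack int->short
    // and short->byte with unsigned saturation.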
18419     int vector_len = 0;
18420     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18421     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18422     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18423   %}
18424   ins_pipe( pipe_slow );
18425 %}
18426 
18427 instruct vcvt8Ito8B_reg(vecD dst, vecY src, vecY tmp, rRegL scratch) %{
18428   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18429   effect(TEMP scratch, TEMP tmp);
18430   match(Set dst (VectorCastI2X src));
18431   format %{ "vpand      $dst,$src,[0x000000FF000000FF]\n\t"
18432             "vextracti128 $tmp,$dst,0x1\n\t"
18433             "vpackusdw  $dst,$dst,$tmp\n\t"
18434             "vpackuswb  $dst,$dst\t! convert 8I to 8B vector" %}
18435   ins_encode %{
18436     int vector_len = 1;
18437     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18438     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
    // The 128-bit extract leaves all live data in an XMM register, so the
    // packing steps run at 128-bit width.
    vector_len = 0;
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18441   %}
18442   ins_pipe( pipe_slow );
18443 %}
18444 
18445 instruct vcvt16Ito16B_reg(vecX dst, vecZ src) %{
18446   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18447   match(Set dst (VectorCastI2X src));
18448     format %{ "evpmovdb   $dst,$src\t! convert 16I to 16B vector" %}
18449   ins_encode %{
18450     int vector_len = 2;
18451     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18452   %}
18453   ins_pipe( pipe_slow );
18454 %}
18455 
18456 instruct vcvt2Ito2S_reg(vecS dst, vecD src, rRegL scratch) %{
18457   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18458   effect(TEMP scratch);
18459   match(Set dst (VectorCastI2X src));
18460   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18461             "vpackusdw  $dst,$dst\t! convert 2I to 2S vector" %}
18462   ins_encode %{
18463     int vector_len = 0;
18464     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18465     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18466   %}
18467   ins_pipe( pipe_slow );
18468 %}
18469 
18470 instruct vcvt4Ito4S_reg(vecD dst, vecX src, rRegL scratch) %{
18471   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18472   effect(TEMP scratch);
18473   match(Set dst (VectorCastI2X src));
18474   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18475             "vpackusdw  $dst,$dst\t! convert 4I to 4S vector" %}
18476   ins_encode %{
18477     int vector_len = 0;
18478     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18479     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18480   %}
18481   ins_pipe( pipe_slow );
18482 %}
18483 
18484 instruct vcvt8Ito8S_reg(vecX dst, vecY src, vecY tmp, rRegL scratch) %{
18485   predicate(UseAVX >= 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18486   effect(TEMP scratch, TEMP tmp);
18487   match(Set dst (VectorCastI2X src));
18488   format %{ "vpand      $dst,$src,[0x0000FFFF0000FFFF]\n\t"
18489             "vextracti128 $tmp,$dst,0x1\n\t"
18490             "vpackusdw  $dst,$dst,$tmp\t! convert 8I to 8S vector" %}
18491   ins_encode %{
18492     int vector_len = 1;
18493     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18494     __ vextracti128($tmp$$XMMRegister, $dst$$XMMRegister, 0x1);
18495     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
18496   %}
18497   ins_pipe( pipe_slow );
18498 %}
18499 
18500 instruct vcvt16Ito16S_reg(vecY dst, vecZ src) %{
18501   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18502   match(Set dst (VectorCastI2X src));
18503     format %{ "evpmovdw   $dst,$src\t! convert 16I to 16S vector" %}
18504   ins_encode %{
18505     int vector_len = 2;
18506     __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18507   %}
18508   ins_pipe( pipe_slow );
18509 %}
18510 
18511 instruct vcvt2Ito2L_reg(vecX dst, vecD src) %{
18512   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18513   match(Set dst (VectorCastI2X src));
18514   format %{ "vpmovsxdq   $dst,$src\t! convert 2I to 2L vector" %}
18515   ins_encode %{
18516     int vector_len = 0;
18517     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18518   %}
18519   ins_pipe( pipe_slow );
18520 %}
18521 
18522 instruct vcvt4Ito4L_reg(vecY dst, vecX src) %{
18523   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18524   match(Set dst (VectorCastI2X src));
18525   format %{ "vpmovsxdq   $dst,$src\t! convert 4I to 4L vector" %}
18526   ins_encode %{
18527     int vector_len = 1;
18528     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18529   %}
18530   ins_pipe( pipe_slow );
18531 %}
18532 
18533 instruct vcvt8Ito8L_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
18535   match(Set dst (VectorCastI2X src));
18536   format %{ "vpmovsxdq   $dst,$src\t! convert 8I to 8L vector" %}
18537   ins_encode %{
18538     int vector_len = 2;
18539     __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18540   %}
18541   ins_pipe( pipe_slow );
18542 %}
18543 
18544 instruct vcvt2Ito2F_reg(vecD dst, vecD src) %{
18545   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18546   match(Set dst (VectorCastI2X src));
18547   format %{ "vcvtdq2ps   $dst,$src\t! convert 2I to 2F vector" %}
18548   ins_encode %{
18549     int vector_len = 0;
18550     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18551   %}
18552   ins_pipe( pipe_slow );
18553 %}
18554 
18555 instruct vcvt4Ito4F_reg(vecX dst, vecX src) %{
18556   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18557   match(Set dst (VectorCastI2X src));
18558   format %{ "vcvtdq2ps   $dst,$src\t! convert 4I to 4F vector" %}
18559   ins_encode %{
18560     int vector_len = 0;
18561     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18562   %}
18563   ins_pipe( pipe_slow );
18564 %}
18565 
18566 instruct vcvt8Ito8F_reg(vecY dst, vecY src) %{
18567   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18568   match(Set dst (VectorCastI2X src));
18569   format %{ "vcvtdq2ps   $dst,$src\t! convert 8I to 8F vector" %}
18570   ins_encode %{
18571     int vector_len = 1;
18572     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18573   %}
18574   ins_pipe( pipe_slow );
18575 %}
18576 
18577 instruct vcvt16Ito16F_reg(vecZ dst, vecZ src) %{
18578   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18579   match(Set dst (VectorCastI2X src));
18580   format %{ "vcvtdq2ps   $dst,$src\t! convert 16I to 16F vector" %}
18581   ins_encode %{
18582     int vector_len = 2;
18583     __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18584   %}
18585   ins_pipe( pipe_slow );
18586 %}
18587 
18588 instruct vcvt2Ito2D_reg(vecX dst, vecD src) %{
18589   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18590   match(Set dst (VectorCastI2X src));
18591   format %{ "vcvtdq2pd   $dst,$src\t! convert 2I to 2D vector" %}
18592   ins_encode %{
18593     int vector_len = 0;
18594     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18595   %}
18596   ins_pipe( pipe_slow );
18597 %}
18598 
18599 instruct vcvt4Ito4D_reg(vecY dst, vecX src) %{
18600   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18601   match(Set dst (VectorCastI2X src));
18602   format %{ "vcvtdq2pd   $dst,$src\t! convert 4I to 4D vector" %}
18603   ins_encode %{
18604     int vector_len = 1;
18605     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18606   %}
18607   ins_pipe( pipe_slow );
18608 %}
18609 
18610 instruct vcvt8Ito8D_reg(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18612   match(Set dst (VectorCastI2X src));
18613   format %{ "vcvtdq2pd   $dst,$src\t! convert 8I to 8D vector" %}
18614   ins_encode %{
18615     int vector_len = 2;
18616     __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18617   %}
18618   ins_pipe( pipe_slow );
18619 %}
18620 
18621 instruct vcvt4Lto4B_reg(vecS dst, vecY src, rRegL scratch) %{
18622   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18623   match(Set dst (VectorCastL2X src));
18624   effect(TEMP scratch);
18625   format %{ "vpermilps  $dst,$src,8\n\t"
18626             "vpermpd    $dst,$dst,8\n\t"
18627             "vpand      $dst,$dst,[0x000000FF000000FF]\n\t"
18628             "vpackusdw  $dst,$dst\n\t"
18629             "vpackuswb  $dst,$dst\t! convert 4L to 4B vector" %}
18630   ins_encode %{
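    // vpermilps/vpermpd gather the low dword of every long into the low 128
    // bits, i.e. a 4L-to-4I cast, before the mask-and-pack sequence.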
18631     int vector_len = 1;
18632     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18633     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
    // The cast to int is done; do the rest of the operations at 128-bit width.
18635     vector_len = 0;
18636     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vector_len, $scratch$$Register);
18637     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18638     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18639   %}
18640   ins_pipe( pipe_slow );
18641 %}
18642 
18643 instruct vcvt8Lto8B_reg(vecD dst, vecZ src) %{
18644   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
18645   match(Set dst (VectorCastL2X src));
18646     format %{ "evpmovqb   $dst,$src\t! convert 8L to 8B vector" %}
18647   ins_encode %{
18648     int vector_len = 2;
18649     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18650   %}
18651   ins_pipe( pipe_slow );
18652 %}
18653 
18654 instruct vcvt2Lto2S_reg(vecS dst, vecX src, rRegL scratch) %{
18655   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18656   match(Set dst (VectorCastL2X src));
18657   effect(TEMP scratch);
18658   format %{ "vpshufd    $dst,$src,8\n\t"
18659             "vpand      $dst,$dst,[0x0000FFFF0000FFFF]\n\t"
18660             "vpackusdw  $dst,$dst\t! convert 2L to 2S vector" %}
18661   ins_encode %{
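    // vpshufd selector 8 (elements 0,2,0,0) moves the low dword of each long
    // into the bottom two dword slots, casting 2L to 2I first.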
18662     int vector_len = 0;
18663     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18664     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18665     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18666   %}
18667   ins_pipe( pipe_slow );
18668 %}
18669 
18670 instruct vcvt4Lto4S_reg(vecD dst, vecY src, rRegL scratch) %{
18671   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18672   match(Set dst (VectorCastL2X src));
18673   effect(TEMP scratch);
18674   format %{ "vpermilps  $dst,$src,8\n\t"
18675             "vpermpd    $dst,$dst,8\n\t"
18676             "vpand      $dst,$dst,[0x0000FFFF0000FFFF]\n\t"
18677             "vpackusdw  $dst,$dst\t! convert 4L to 4S vector" %}
18678   ins_encode %{
18679     int vector_len = 1;
18680     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18681     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
    // The cast to int is done; do the rest of the operations at 128-bit width.
18683     vector_len = 0;
18684     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vector_len, $scratch$$Register);
18685     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
18686   %}
18687   ins_pipe( pipe_slow );
18688 %}
18689 
18690 instruct vcvt8Lto8S_reg(vecX dst, vecZ src) %{
18691   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
18692   match(Set dst (VectorCastL2X src));
18693     format %{ "evpmovqw   $dst,$src\t! convert 8L to 8S vector" %}
18694   ins_encode %{
18695     int vector_len = 2;
18696     __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18697   %}
18698   ins_pipe( pipe_slow );
18699 %}
18700 
18701 instruct vcvt1Lto1I_reg(vecS dst, vecD src) %{
18702   predicate(n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18703   match(Set dst (VectorCastL2X src));
18704   format %{ "movdqu   $dst,$src\t! convert 1L to 1I vector" %}
18705   ins_encode %{
18706     // If register is the same, then move is not needed.
18707     if ($dst$$XMMRegister != $src$$XMMRegister) {
18708       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
18709     }
18710   %}
18711   ins_pipe( pipe_slow );
18712 %}
18713 
18714 instruct vcvt2Lto2I_reg(vecD dst, vecX src) %{
18715   predicate(UseAVX == 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18716   match(Set dst (VectorCastL2X src));
18717   format %{ "pshufd   $dst,$src,8\t! convert 2L to 2I vector" %}
18718   ins_encode %{
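    // pshufd selector 8 picks dwords 0 and 2, i.e. the low half of each long.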
18719     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
18720   %}
18721   ins_pipe( pipe_slow );
18722 %}
18723 
18724 instruct vcvt2Lto2I_reg_avx(vecD dst, vecX src) %{
18725   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18726   match(Set dst (VectorCastL2X src));
18727   format %{ "vpshufd   $dst,$src,8\t! convert 2L to 2I vector" %}
18728   ins_encode %{
18729     int vector_len = 0;
18730     __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18731   %}
18732   ins_pipe( pipe_slow );
18733 %}
18734 
18735 instruct vcvt4Lto4I_reg(vecX dst, vecY src) %{
18736   predicate(UseAVX >= 2 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18737   match(Set dst (VectorCastL2X src));
18738   format %{ "vpermilps  $dst,$src,8\n\t"
18739           "vpermpd  $dst,$dst,8\t! convert 4L to 4I vector" %}
18740   ins_encode %{
18741     int vector_len = 1;
18742     __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vector_len);
18743     __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vector_len);
18744   %}
18745   ins_pipe( pipe_slow );
18746 %}
18747 
18748 instruct vcvt8Lto8I_reg(vecY dst, vecZ src) %{
18749   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_INT);
18750   match(Set dst (VectorCastL2X src));
18751     format %{ "evpmovqd   $dst,$src\t! convert 8L to 8I vector" %}
18752   ins_encode %{
18753     int vector_len = 2;
18754     __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18755   %}
18756   ins_pipe( pipe_slow );
18757 %}
18758 
18759 instruct vcvt2Lto2F_reg(vecD dst, vecX src) %{
18760   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18761   match(Set dst (VectorCastL2X src));
18762   format %{ "vcvtqq2ps   $dst,$src\t! convert 2L to 2F vector" %}
18763   ins_encode %{
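    // evcvtqq2ps is an AVX512DQ instruction, hence the supports_avx512dq() check
    // in the predicate.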
18764     int vector_len = 0;
18765     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18766   %}
18767   ins_pipe( pipe_slow );
18768 %}
18769 
18770 instruct vcvt4Lto4F_reg(vecX dst, vecY src) %{
18771   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18772   match(Set dst (VectorCastL2X src));
18773   format %{ "vcvtqq2ps   $dst,$src\t! convert 4L to 4F vector" %}
18774   ins_encode %{
18775     int vector_len = 1;
18776     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18777   %}
18778   ins_pipe( pipe_slow );
18779 %}
18780 
18781 instruct vcvt8Lto8F_reg(vecY dst, vecZ src) %{
18782   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18783   match(Set dst (VectorCastL2X src));
18784   format %{ "vcvtqq2ps   $dst,$src\t! convert 8L to 8F vector" %}
18785   ins_encode %{
18786     int vector_len = 2;
18787     __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18788   %}
18789   ins_pipe( pipe_slow );
18790 %}
18791 
18792 instruct vcvt1Lto1D_reg(vecD dst, vecD src) %{
18793   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 1 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18794   match(Set dst (VectorCastL2X src));
18795   format %{ "vcvtqq2pd   $dst,$src\t! convert 1L to 1D vector" %}
18796   ins_encode %{
18797     int vector_len = 0;
18798     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18799   %}
18800   ins_pipe( pipe_slow );
18801 %}
18802 
18803 instruct vcvt2Lto2D_reg(vecX dst, vecX src) %{
18804   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18805   match(Set dst (VectorCastL2X src));
18806   format %{ "vcvtqq2pd   $dst,$src\t! convert 2L to 2D vector" %}
18807   ins_encode %{
18808     int vector_len = 0;
18809     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18810   %}
18811   ins_pipe( pipe_slow );
18812 %}
18813 
18814 instruct vcvt4Lto4D_reg(vecY dst, vecY src) %{
18815   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18816   match(Set dst (VectorCastL2X src));
18817   format %{ "vcvtqq2pd   $dst,$src\t! convert 4L to 4D vector" %}
18818   ins_encode %{
18819     int vector_len = 1;
18820     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18821   %}
18822   ins_pipe( pipe_slow );
18823 %}
18824 
18825 instruct vcvt8Lto8D_reg(vecZ dst, vecZ src) %{
18826   predicate(UseAVX > 2 && VM_Version::supports_avx512dq() && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18827   match(Set dst (VectorCastL2X src));
18828   format %{ "vcvtqq2pd   $dst,$src\t! convert 8L to 8D vector" %}
18829   ins_encode %{
18830     int vector_len = 2;
18831     __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18832   %}
18833   ins_pipe( pipe_slow );
18834 %}
18835 
18836 instruct vcvt2Fto2D_reg(vecX dst, vecD src) %{
18837   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18838   match(Set dst (VectorCastF2X src));
18839   format %{ "vcvtps2pd   $dst,$src\t! convert 2F to 2D vector" %}
18840   ins_encode %{
18841     int vector_len = 0;
18842     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18843   %}
18844   ins_pipe( pipe_slow );
18845 %}
18846 
18847 instruct vcvt4Fto4D_reg(vecY dst, vecX src) %{
18848   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18849   match(Set dst (VectorCastF2X src));
18850   format %{ "vcvtps2pd   $dst,$src\t! convert 4F to 4D vector" %}
18851   ins_encode %{
18852     int vector_len = 1;
18853     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18854   %}
18855   ins_pipe( pipe_slow );
18856 %}
18857 
18858 instruct vcvt8Fto8D_reg(vecZ dst, vecY src) %{
18859   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
18860   match(Set dst (VectorCastF2X src));
18861   format %{ "vcvtps2pd   $dst,$src\t! convert 8F to 8D vector" %}
18862   ins_encode %{
18863     int vector_len = 2;
18864     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18865   %}
18866   ins_pipe( pipe_slow );
18867 %}
18868 
18869 instruct vcvt2Dto2F_reg(vecD dst, vecX src) %{
18870   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18871   match(Set dst (VectorCastD2X src));
18872   format %{ "vcvtpd2ps   $dst,$src\t! convert 2D to 2F vector" %}
18873   ins_encode %{
18874     int vector_len = 0;
18875     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18876   %}
18877   ins_pipe( pipe_slow );
18878 %}
18879 
18880 instruct vcvt4Dto4F_reg(vecX dst, vecY src) %{
18881   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18882   match(Set dst (VectorCastD2X src));
18883   format %{ "vcvtpd2ps   $dst,$src\t! convert 4D to 4F vector" %}
18884   ins_encode %{
18885     int vector_len = 1;
18886     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18887   %}
18888   ins_pipe( pipe_slow );
18889 %}
18890 
18891 instruct vcvt8Dto8F_reg(vecY dst, vecZ src) %{
18892   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18893   match(Set dst (VectorCastD2X src));
18894   format %{ "vcvtpd2ps   $dst,$src\t! convert 8D to 8F vector" %}
18895   ins_encode %{
18896     int vector_len = 2;
18897     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
18898   %}
18899   ins_pipe( pipe_slow );
18900 %}
18901 
18902 instruct vcmpeq2F(vecD dst, vecD src1, vecD src2) %{
18903   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
18904             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18905             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18906   match(Set dst (VectorMaskCmp src1 src2));
18907   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed2F" %}
18908   ins_encode %{
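    // vcmpps leaves all-ones in each lane where the predicate holds and all-zeros
    // elsewhere, producing the vector mask directly in $dst.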
18909     int vector_len = 0;
18910     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18911     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18912   %}
18913   ins_pipe( pipe_slow );
18914 %}
18915 
18916 instruct vcmpeq4F(vecX dst, vecX src1, vecX src2) %{
18917   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
18918             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18919             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18920   match(Set dst (VectorMaskCmp src1 src2));
18921   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed4F" %}
18922   ins_encode %{
18923     int vector_len = 0;
18924     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18925     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18926   %}
18927   ins_pipe( pipe_slow );
18928 %}
18929 
18930 instruct vcmpeq8F(vecY dst, vecY src1, vecY src2) %{
18931   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
18932             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18933             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18934   match(Set dst (VectorMaskCmp src1 src2));
18935   format %{ "vcmpeqps  $dst,$src1,$src2\t! cmpeq packed8F" %}
18936   ins_encode %{
18937     int vector_len = 1;
18938     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18939     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18940   %}
18941   ins_pipe( pipe_slow );
18942 %}
18943 
18944 instruct vcmpeq16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
18945   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
18946             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
18947             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18948   match(Set dst (VectorMaskCmp src1 src2));
18949   effect(TEMP dst, TEMP scratch);
18950   format %{ "vcmpeqps  k2,$src1,$src2\n\t"
18951             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16F" %}
18952   ins_encode %{
18953     int vector_len = 2;
18954     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
18955     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
18956     KRegister mask = k0; // The comparison itself is not being masked.
18957     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18958     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
18959   %}
18960   ins_pipe( pipe_slow );
18961 %}
18962 
18963 instruct vcmplt2F(vecD dst, vecD src1, vecD src2) %{
18964   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
18965             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
18966             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18967   match(Set dst (VectorMaskCmp src1 src2));
18968   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed2F" %}
18969   ins_encode %{
18970     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; // ordered non-signaling
18972     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18973   %}
18974   ins_pipe( pipe_slow );
18975 %}
18976 
18977 instruct vcmplt4F(vecX dst, vecX src1, vecX src2) %{
18978   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
18979             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
18980             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18981   match(Set dst (VectorMaskCmp src1 src2));
18982   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed4F" %}
18983   ins_encode %{
18984     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; // ordered non-signaling
18986     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
18987   %}
18988   ins_pipe( pipe_slow );
18989 %}
18990 
18991 instruct vcmplt8F(vecY dst, vecY src1, vecY src2) %{
18992   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
18993             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
18994             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
18995   match(Set dst (VectorMaskCmp src1 src2));
18996   format %{ "vcmpltps  $dst,$src1,$src2\t! cmplt packed8F" %}
18997   ins_encode %{
18998     int vector_len = 1;
    Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; // ordered non-signaling
19000     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19001   %}
19002   ins_pipe( pipe_slow );
19003 %}
19004 
19005 instruct vcmplt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19006   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19007             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19008             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19009   match(Set dst (VectorMaskCmp src1 src2));
19010   effect(TEMP dst, TEMP scratch);
19011   format %{ "vcmpltps  k2,$src1,$src2\n\t"
19012             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed16F" %}
19013   ins_encode %{
19014     int vector_len = 2;
19015     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19016     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19017     KRegister mask = k0; // The comparison itself is not being masked.
19018     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19019     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19020   %}
19021   ins_pipe( pipe_slow );
19022 %}
19023 
19024 instruct vcmpgt2F(vecD dst, vecD src1, vecD src2) %{
19025   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19026             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19027             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19028   match(Set dst (VectorMaskCmp src1 src2));
19029   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed2F" %}
19030   ins_encode %{
19031     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; // ordered non-signaling
19033     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19034   %}
19035   ins_pipe( pipe_slow );
19036 %}
19037 
19038 instruct vcmpgt4F(vecX dst, vecX src1, vecX src2) %{
19039   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19040             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19041             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19042   match(Set dst (VectorMaskCmp src1 src2));
19043   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed4F" %}
19044   ins_encode %{
19045     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; // ordered non-signaling
19047     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19048   %}
19049   ins_pipe( pipe_slow );
19050 %}
19051 
19052 instruct vcmpgt8F(vecY dst, vecY src1, vecY src2) %{
19053   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19054             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19055             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19056   match(Set dst (VectorMaskCmp src1 src2));
19057   format %{ "vcmpgtps  $dst,$src1,$src2\t! cmpgt packed8F" %}
19058   ins_encode %{
19059     int vector_len = 1;
    Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; // ordered non-signaling
19061     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19062   %}
19063   ins_pipe( pipe_slow );
19064 %}
19065 
19066 instruct vcmpgt16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19067   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19068             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19069             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19070   match(Set dst (VectorMaskCmp src1 src2));
19071   effect(TEMP dst, TEMP scratch);
19072   format %{ "vcmpgtps  k2,$src1,$src2\n\t"
19073             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16F" %}
19074   ins_encode %{
19075     int vector_len = 2;
19076     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19077     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19078     KRegister mask = k0; // The comparison itself is not being masked.
19079     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19080     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19081   %}
19082   ins_pipe( pipe_slow );
19083 %}
19084 
19085 instruct vcmpge2F(vecD dst, vecD src1, vecD src2) %{
19086   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19087             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19088             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19089   match(Set dst (VectorMaskCmp src1 src2));
19090   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed2F" %}
19091   ins_encode %{
19092     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; // ordered non-signaling
19094     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19095   %}
19096   ins_pipe( pipe_slow );
19097 %}
19098 
19099 instruct vcmpge4F(vecX dst, vecX src1, vecX src2) %{
19100   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19101             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19102             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19103   match(Set dst (VectorMaskCmp src1 src2));
19104   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed4F" %}
19105   ins_encode %{
19106     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; // ordered non-signaling
19108     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19109   %}
19110   ins_pipe( pipe_slow );
19111 %}
19112 
19113 instruct vcmpge8F(vecY dst, vecY src1, vecY src2) %{
19114   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19115             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19116             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19117   match(Set dst (VectorMaskCmp src1 src2));
19118   format %{ "vcmpgeps  $dst,$src1,$src2\t! cmpge packed8F" %}
19119   ins_encode %{
19120     int vector_len = 1;
    Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; // ordered non-signaling
19122     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19123   %}
19124   ins_pipe( pipe_slow );
19125 %}
19126 
19127 instruct vcmpge16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19128   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19129             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19130             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19131   match(Set dst (VectorMaskCmp src1 src2));
19132   effect(TEMP dst, TEMP scratch);
19133   format %{ "vcmpgeps  k2,$src1,$src2\n\t"
19134             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16F" %}
19135   ins_encode %{
19136     int vector_len = 2;
19137     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19138     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19139     KRegister mask = k0; // The comparison itself is not being masked.
19140     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19141     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19142   %}
19143   ins_pipe( pipe_slow );
19144 %}
19145 
19146 instruct vcmple2F(vecD dst, vecD src1, vecD src2) %{
19147   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19148             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19149             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19150   match(Set dst (VectorMaskCmp src1 src2));
19151   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed2F" %}
19152   ins_encode %{
19153     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; // ordered non-signaling
19155     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19156   %}
19157   ins_pipe( pipe_slow );
19158 %}
19159 
19160 instruct vcmple4F(vecX dst, vecX src1, vecX src2) %{
19161   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19162             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19163             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19164   match(Set dst (VectorMaskCmp src1 src2));
19165   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed4F" %}
19166   ins_encode %{
19167     int vector_len = 0;
    Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; // ordered non-signaling
19169     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19170   %}
19171   ins_pipe( pipe_slow );
19172 %}
19173 
19174 instruct vcmple8F(vecY dst, vecY src1, vecY src2) %{
19175   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19176             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19177             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19178   match(Set dst (VectorMaskCmp src1 src2));
19179   format %{ "vcmpleps  $dst,$src1,$src2\t! cmple packed8F" %}
19180   ins_encode %{
19181     int vector_len = 1;
    Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; // ordered non-signaling
19183     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19184   %}
19185   ins_pipe( pipe_slow );
19186 %}
19187 
19188 instruct vcmple16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19189   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19190             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19191             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19192   match(Set dst (VectorMaskCmp src1 src2));
19193   effect(TEMP dst, TEMP scratch);
19194   format %{ "vcmpleps  k2,$src1,$src2\n\t"
19195             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16F" %}
19196   ins_encode %{
19197     int vector_len = 2;
19198     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19199     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19200     KRegister mask = k0; // The comparison itself is not being masked.
19201     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19202     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19203   %}
19204   ins_pipe( pipe_slow );
19205 %}
19206 
19207 instruct vcmpne2F(vecD dst, vecD src1, vecD src2) %{
19208   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19209             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19210             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19211   match(Set dst (VectorMaskCmp src1 src2));
19212   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed2F" %}
19213   ins_encode %{
19214     int vector_len = 0;
19215     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
    Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; // unordered non-signaling
19217     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19218   %}
19219   ins_pipe( pipe_slow );
19220 %}
19221 
19222 instruct vcmpne4F(vecX dst, vecX src1, vecX src2) %{
19223   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19224             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19225             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19226   match(Set dst (VectorMaskCmp src1 src2));
19227   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed4F" %}
19228   ins_encode %{
19229     int vector_len = 0;
19230     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
    Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; // unordered non-signaling
19232     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19233   %}
19234   ins_pipe( pipe_slow );
19235 %}
19236 
19237 instruct vcmpne8F(vecY dst, vecY src1, vecY src2) %{
19238   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
19239             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19240             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19241   match(Set dst (VectorMaskCmp src1 src2));
19242   format %{ "vcmpneps  $dst,$src1,$src2\t! cmpne packed8F" %}
19243   ins_encode %{
19244     int vector_len = 1;
19245     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
    Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; // unordered non-signaling
19247     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19248   %}
19249   ins_pipe( pipe_slow );
19250 %}
19251 
19252 instruct vcmpne16F(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19253   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19254             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19255             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
19256   match(Set dst (VectorMaskCmp src1 src2));
19257   effect(TEMP dst, TEMP scratch);
19258   format %{ "vcmpneps  k2,$src1,$src2\n\t"
19259             "vmovdqu8  $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed16F" %}
19260   ins_encode %{
19261     int vector_len = 2;
19262     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19263     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19264     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19265     KRegister mask = k0; // The comparison itself is not being masked.
19266     __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19267     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19268   %}
19269   ins_pipe( pipe_slow );
19270 %}
19271 
19272 instruct vcmpeq1D(vecD dst, vecD src1, vecD src2) %{
19273   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19274             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19275             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19276   match(Set dst (VectorMaskCmp src1 src2));
19277   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed1D" %}
19278   ins_encode %{
19279     int vector_len = 0;
19280     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19281     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19282   %}
19283   ins_pipe( pipe_slow );
19284 %}
19285 
19286 instruct vcmpeq2D(vecX dst, vecX src1, vecX src2) %{
19287   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19288             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19289             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19290   match(Set dst (VectorMaskCmp src1 src2));
19291   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed2D" %}
19292   ins_encode %{
19293     int vector_len = 0;
19294     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19295     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19296   %}
19297   ins_pipe( pipe_slow );
19298 %}
19299 
19300 instruct vcmpeq4D(vecY dst, vecY src1, vecY src2) %{
19301   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19302             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19303             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19304   match(Set dst (VectorMaskCmp src1 src2));
19305   format %{ "vcmpeqpd  $dst,$src1,$src2\t! cmpeq packed4D" %}
19306   ins_encode %{
19307     int vector_len = 1;
19308     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19309     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19310   %}
19311   ins_pipe( pipe_slow );
19312 %}
19313 
19314 instruct vcmpeq8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19315   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19316             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19317             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19318   match(Set dst (VectorMaskCmp src1 src2));
19319   effect(TEMP dst, TEMP scratch);
19320   format %{ "vcmpeqpd  k2,$src1,$src2\n\t"
19321             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8D" %}
19322   ins_encode %{
19323     int vector_len = 2;
19324     Assembler::ComparisonPredicateFP cmp = Assembler::EQ_OQ;  // ordered non-signaling
19325     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19326     KRegister mask = k0; // The comparison itself is not being masked.
19327     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19328     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19329   %}
19330   ins_pipe( pipe_slow );
19331 %}
19332 
19333 instruct vcmplt1D(vecD dst, vecD src1, vecD src2) %{
19334   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19335             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19336             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19337   match(Set dst (VectorMaskCmp src1 src2));
19338   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed1D" %}
19339   ins_encode %{
19340     int vector_len = 0;
19341     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19342     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19343   %}
19344   ins_pipe( pipe_slow );
19345 %}
19346 
19347 instruct vcmplt2D(vecX dst, vecX src1, vecX src2) %{
19348   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19349             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19350             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19351   match(Set dst (VectorMaskCmp src1 src2));
19352   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed2D" %}
19353   ins_encode %{
19354     int vector_len = 0;
19355     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19356     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19357   %}
19358   ins_pipe( pipe_slow );
19359 %}
19360 
19361 instruct vcmplt4D(vecY dst, vecY src1, vecY src2) %{
19362   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19363             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19364             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19365   match(Set dst (VectorMaskCmp src1 src2));
19366   format %{ "vcmpltpd  $dst,$src1,$src2\t! cmplt packed4D" %}
19367   ins_encode %{
19368     int vector_len = 1;
19369     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19370     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19371   %}
19372   ins_pipe( pipe_slow );
19373 %}
19374 
19375 instruct vcmplt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19376   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19377             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19378             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19379   match(Set dst (VectorMaskCmp src1 src2));
19380   effect(TEMP dst, TEMP scratch);
19381   format %{ "vcmpltpd  k2,$src1,$src2\n\t"
19382             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8D" %}
19383   ins_encode %{
19384     int vector_len = 2;
19385     Assembler::ComparisonPredicateFP cmp = Assembler::LT_OQ; //ordered non-signaling
19386     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19387     KRegister mask = k0; // The comparison itself is not being masked.
19388     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19389     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19390   %}
19391   ins_pipe( pipe_slow );
19392 %}
19393 
19394 instruct vcmpgt1D(vecD dst, vecD src1, vecD src2) %{
19395   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19396             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19397             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19398   match(Set dst (VectorMaskCmp src1 src2));
19399   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed1D" %}
19400   ins_encode %{
19401     int vector_len = 0;
19402     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19403     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19404   %}
19405   ins_pipe( pipe_slow );
19406 %}
19407 
19408 instruct vcmpgt2D(vecX dst, vecX src1, vecX src2) %{
19409   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19410             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19411             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19412   match(Set dst (VectorMaskCmp src1 src2));
19413   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed2D" %}
19414   ins_encode %{
19415     int vector_len = 0;
19416     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19417     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19418   %}
19419   ins_pipe( pipe_slow );
19420 %}
19421 
19422 instruct vcmpgt4D(vecY dst, vecY src1, vecY src2) %{
19423   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19424             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19425             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19426   match(Set dst (VectorMaskCmp src1 src2));
19427   format %{ "vcmpgtpd  $dst,$src1,$src2\t! cmpgt packed4D" %}
19428   ins_encode %{
19429     int vector_len = 1;
19430     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19431     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19432   %}
19433   ins_pipe( pipe_slow );
19434 %}
19435 
19436 instruct vcmpgt8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19437   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19438             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19439             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19440   match(Set dst (VectorMaskCmp src1 src2));
19441   effect(TEMP dst, TEMP scratch);
19442   format %{ "vcmpgtpd  k2,$src1,$src2\n\t"
19443             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed8D" %}
19444   ins_encode %{
19445     int vector_len = 2;
19446     Assembler::ComparisonPredicateFP cmp = Assembler::GT_OQ; //ordered non-signaling
19447     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19448     KRegister mask = k0; // The comparison itself is not being masked.
19449     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19450     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19451   %}
19452   ins_pipe( pipe_slow );
19453 %}
19454 
19455 instruct vcmpge1D(vecD dst, vecD src1, vecD src2) %{
19456   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19457             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19458             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19459   match(Set dst (VectorMaskCmp src1 src2));
19460   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed1D" %}
19461   ins_encode %{
19462     int vector_len = 0;
19463     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19464     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19465   %}
19466   ins_pipe( pipe_slow );
19467 %}
19468 
19469 instruct vcmpge2D(vecX dst, vecX src1, vecX src2) %{
19470   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19471             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19472             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19473   match(Set dst (VectorMaskCmp src1 src2));
19474   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed2D" %}
19475   ins_encode %{
19476     int vector_len = 0;
19477     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19478     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19479   %}
19480   ins_pipe( pipe_slow );
19481 %}
19482 
19483 instruct vcmpge4D(vecY dst, vecY src1, vecY src2) %{
19484   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19485             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19486             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19487   match(Set dst (VectorMaskCmp src1 src2));
19488   format %{ "vcmpgepd  $dst,$src1,$src2\t! cmpge packed4D" %}
19489   ins_encode %{
19490     int vector_len = 1;
19491     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19492     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19493   %}
19494   ins_pipe( pipe_slow );
19495 %}
19496 
19497 instruct vcmpge8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19498   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19499             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19500             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19501   match(Set dst (VectorMaskCmp src1 src2));
19502   effect(TEMP dst, TEMP scratch);
19503   format %{ "vcmpgepd  k2,$src1,$src2\n\t"
19504             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed8D" %}
19505   ins_encode %{
19506     int vector_len = 2;
19507     Assembler::ComparisonPredicateFP cmp = Assembler::GE_OQ; //ordered non-signaling
19508     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19509     KRegister mask = k0; // The comparison itself is not being masked.
19510     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19511     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19512   %}
19513   ins_pipe( pipe_slow );
19514 %}
19515 
19516 instruct vcmple1D(vecD dst, vecD src1, vecD src2) %{
19517   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19518             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19519             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19520   match(Set dst (VectorMaskCmp src1 src2));
19521   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed1D" %}
19522   ins_encode %{
19523     int vector_len = 0;
19524     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19525     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19526   %}
19527   ins_pipe( pipe_slow );
19528 %}
19529 
19530 instruct vcmple2D(vecX dst, vecX src1, vecX src2) %{
19531   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19532             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19533             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19534   match(Set dst (VectorMaskCmp src1 src2));
19535   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed2D" %}
19536   ins_encode %{
19537     int vector_len = 0;
19538     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19539     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19540   %}
19541   ins_pipe( pipe_slow );
19542 %}
19543 
19544 instruct vcmple4D(vecY dst, vecY src1, vecY src2) %{
19545   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19546             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19547             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19548   match(Set dst (VectorMaskCmp src1 src2));
19549   format %{ "vcmplepd  $dst,$src1,$src2\t! cmple packed4D" %}
19550   ins_encode %{
19551     int vector_len = 1;
19552     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19553     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19554   %}
19555   ins_pipe( pipe_slow );
19556 %}
19557 
19558 instruct vcmple8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19559   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19560             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19561             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19562   match(Set dst (VectorMaskCmp src1 src2));
19563   effect(TEMP dst, TEMP scratch);
19564   format %{ "vcmplepd  k2,$src1,$src2\n\t"
19565             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed8D" %}
19566   ins_encode %{
19567     int vector_len = 2;
19568     Assembler::ComparisonPredicateFP cmp = Assembler::LE_OQ; //ordered non-signaling
19569     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19570     KRegister mask = k0; // The comparison itself is not being masked.
19571     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19572     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19573   %}
19574   ins_pipe( pipe_slow );
19575 %}
19576 
19577 instruct vcmpne1D(vecD dst, vecD src1, vecD src2) %{
19578   predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
19579             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19580             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19581   match(Set dst (VectorMaskCmp src1 src2));
19582   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed1D" %}
19583   ins_encode %{
19584     int vector_len = 0;
19585     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19586     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19587     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19588   %}
19589   ins_pipe( pipe_slow );
19590 %}
19591 
19592 instruct vcmpne2D(vecX dst, vecX src1, vecX src2) %{
19593   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19594             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19595             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19596   match(Set dst (VectorMaskCmp src1 src2));
19597   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed2D" %}
19598   ins_encode %{
19599     int vector_len = 0;
19600     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19601     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19602     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19603   %}
19604   ins_pipe( pipe_slow );
19605 %}
19606 
19607 instruct vcmpne4D(vecY dst, vecY src1, vecY src2) %{
19608   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19609             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19610             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19611   match(Set dst (VectorMaskCmp src1 src2));
19612   format %{ "vcmpnepd  $dst,$src1,$src2\t! cmpne packed4D" %}
19613   ins_encode %{
19614     int vector_len = 1;
19615     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19616     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19617     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19618   %}
19619   ins_pipe( pipe_slow );
19620 %}
19621 
19622 instruct vcmpne8D(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19623   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
19624             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19625             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE);
19626   match(Set dst (VectorMaskCmp src1 src2));
19627   effect(TEMP dst, TEMP scratch);
19628   format %{ "vcmpnepd  k2,$src1,$src2\n\t"
19629             "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpne packed8D" %}
19630   ins_encode %{
19631     int vector_len = 2;
19632     // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
19633     Assembler::ComparisonPredicateFP cmp = Assembler::NEQ_UQ; //unordered non-signaling
19634     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19635     KRegister mask = k0; // The comparison itself is not being masked.
19636     __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19637     __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19638   %}
19639   ins_pipe( pipe_slow );
19640 %}
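
      // Integer vector compares. AVX/AVX2 provide only packed equal (vpcmpeq*)
      // and signed greater-than (vpcmpgt*); the remaining predicates are
      // synthesized: lt swaps the operands of gt, while ge, le and ne compute
      // the complementary predicate (gt or eq) and invert the result with a
      // vpxor against the all-ones constant. AVX-512 (UseAVX > 2) supports the
      // full predicate set directly via evpcmp* into a k register.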
19641 
19642 instruct vcmpeq2I(vecD dst, vecD src1, vecD src2) %{
19643   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19644             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19645             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19646   match(Set dst (VectorMaskCmp src1 src2));
19647   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t! cmpeq packed2I" %}
19648   ins_encode %{
19649     int vector_len = 0;
19650     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19651   %}
19652   ins_pipe( pipe_slow );
19653 %}
19654 
19655 instruct vcmpeq4I(vecX dst, vecX src1, vecX src2) %{
19656   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19657             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19658             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19659   match(Set dst (VectorMaskCmp src1 src2));
19660   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t! cmpeq packed4I" %}
19661   ins_encode %{
19662     int vector_len = 0;
19663     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19664   %}
19665   ins_pipe( pipe_slow );
19666 %}
19667 
19668 instruct vcmpeq8I(vecY dst, vecY src1, vecY src2) %{
19669   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19670             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19671             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19672   match(Set dst (VectorMaskCmp src1 src2));
19673   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t! cmpeq packed8I" %}
19674   ins_encode %{
19675     int vector_len = 1;
19676     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19677   %}
19678   ins_pipe( pipe_slow );
19679 %}
19680 
19681 instruct vcmpeq16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19682   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19683             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
19684             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19685   match(Set dst (VectorMaskCmp src1 src2));
19686   effect(TEMP dst, TEMP scratch);
19687   format %{ "vpcmpeqd  k2,$src1,$src2\n\t"
19688             "vmovdqu32 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed16I" %}
19689   ins_encode %{
19690     int vector_len = 2;
19691     Assembler::ComparisonPredicate cmp = Assembler::eq;
19692     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19693     KRegister mask = k0; // The comparison itself is not being masked.
19694     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19695     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19696   %}
19697   ins_pipe( pipe_slow );
19698 %}
19699 
19700 instruct vcmplt2I(vecD dst, vecD src1, vecD src2) %{
19701   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19702             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19703             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19704   match(Set dst (VectorMaskCmp src1 src2));
19705   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed2I" %}
19706   ins_encode %{
19707     int vector_len = 0;
19708     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19709   %}
19710   ins_pipe( pipe_slow );
19711 %}
19712 
19713 instruct vcmplt4I(vecX dst, vecX src1, vecX src2) %{
19714   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19715             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19716             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19717   match(Set dst (VectorMaskCmp src1 src2));
19718   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed4I" %}
19719   ins_encode %{
19720     int vector_len = 0;
19721     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19722   %}
19723   ins_pipe( pipe_slow );
19724 %}
19725 
19726 instruct vcmplt8I(vecY dst, vecY src1, vecY src2) %{
19727   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19728             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19729             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19730   match(Set dst (VectorMaskCmp src1 src2));
19731   format %{ "vpcmpgtd  $dst,$src2,$src1\t! cmplt packed8I" %}
19732   ins_encode %{
19733     int vector_len = 1;
19734     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19735   %}
19736   ins_pipe( pipe_slow );
19737 %}
19738 
19739 instruct vcmplt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19740   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19741             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
19742             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19743   match(Set dst (VectorMaskCmp src1 src2));
19744   effect(TEMP dst, TEMP scratch);
19745   format %{ "vpcmpnled  k2,$src1,$src2\n\t"
19746             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16I" %}
19747   ins_encode %{
19748     int vector_len = 2;
19749     Assembler::ComparisonPredicate cmp = Assembler::lt;
19750     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19751     KRegister mask = k0; // The comparison itself is not being masked.
19752     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19753     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19754   %}
19755   ins_pipe( pipe_slow );
19756 %}
19757 
19758 instruct vcmpgt2I(vecD dst, vecD src1, vecD src2) %{
19759   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19760             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19761             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19762   match(Set dst (VectorMaskCmp src1 src2));
19763   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed2I" %}
19764   ins_encode %{
19765     int vector_len = 0;
19766     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19767   %}
19768   ins_pipe( pipe_slow );
19769 %}
19770 
19771 instruct vcmpgt4I(vecX dst, vecX src1, vecX src2) %{
19772   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19773             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19774             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19775   match(Set dst (VectorMaskCmp src1 src2));
19776   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed4I" %}
19777   ins_encode %{
19778     int vector_len = 0;
19779     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19780   %}
19781   ins_pipe( pipe_slow );
19782 %}
19783 
19784 instruct vcmpgt8I(vecY dst, vecY src1, vecY src2) %{
19785   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19786             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19787             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19788   match(Set dst (VectorMaskCmp src1 src2));
19789   format %{ "vpcmpgtd  $dst,$src1,$src2\t! cmpgt packed8I" %}
19790   ins_encode %{
19791     int vector_len = 1;
19792     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19793   %}
19794   ins_pipe( pipe_slow );
19795 %}
19796 
19797 instruct vcmpgt16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19798   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19799             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
19800             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19801   match(Set dst (VectorMaskCmp src1 src2));
19802   effect(TEMP dst, TEMP scratch);
19803   format %{ "vpcmpnled  k2,$src1,$src2\n\t"
19804             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed16I" %}
19805   ins_encode %{
19806     int vector_len = 2;
19807     Assembler::ComparisonPredicate cmp = Assembler::nle;
19808     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19809     KRegister mask = k0; // The comparison itself is not being masked.
19810     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19811     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19812   %}
19813   ins_pipe( pipe_slow );
19814 %}
19815 
19816 instruct vcmpge2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
19817   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19818             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19819             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19820   match(Set dst (VectorMaskCmp src1 src2));
19821   effect(TEMP scratch);
19822   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
19823             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed2I" %}
19824   ins_encode %{
19825     int vector_len = 0;
19826     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19827     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19828   %}
19829   ins_pipe( pipe_slow );
19830 %}
19831 
19832 instruct vcmpge4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
19833   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19834             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19835             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19836   match(Set dst (VectorMaskCmp src1 src2));
19837   effect(TEMP scratch);
19838   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
19839             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4I" %}
19840   ins_encode %{
19841     int vector_len = 0;
19842     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19843     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19844   %}
19845   ins_pipe( pipe_slow );
19846 %}
19847 
19848 instruct vcmpge8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
19849   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19850             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19851             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19852   match(Set dst (VectorMaskCmp src1 src2));
19853   effect(TEMP scratch);
19854   format %{ "vpcmpgtd  $dst,$src2,$src1\n\t"
19855             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8I" %}
19856   ins_encode %{
19857     int vector_len = 1;
19858     __ vpcmpgtd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
19859     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19860   %}
19861   ins_pipe( pipe_slow );
19862 %}
19863 
19864 instruct vcmpge16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19865   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19866             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
19867             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19868   match(Set dst (VectorMaskCmp src1 src2));
19869   effect(TEMP dst, TEMP scratch);
19870   format %{ "vpcmpnltd  k2,$src1,$src2\n\t"
19871             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed16I" %}
19872   ins_encode %{
19873     int vector_len = 2;
19874     Assembler::ComparisonPredicate cmp = Assembler::nlt;
19875     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19876     KRegister mask = k0; // The comparison itself is not being masked.
19877     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19878     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19879   %}
19880   ins_pipe( pipe_slow );
19881 %}
19882 
19883 instruct vcmple2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
19884   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19885             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19886             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19887   match(Set dst (VectorMaskCmp src1 src2));
19888   effect(TEMP scratch);
19889   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
19890             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed2I" %}
19891   ins_encode %{
19892     int vector_len = 0;
19893     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19894     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19895   %}
19896   ins_pipe( pipe_slow );
19897 %}
19898 
19899 instruct vcmple4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
19900   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19901             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19902             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19903   match(Set dst (VectorMaskCmp src1 src2));
19904   effect(TEMP scratch);
19905   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
19906             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4I" %}
19907   ins_encode %{
19908     int vector_len = 0;
19909     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19910     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19911   %}
19912   ins_pipe( pipe_slow );
19913 %}
19914 
19915 instruct vcmple8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
19916   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19917             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19918             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19919   match(Set dst (VectorMaskCmp src1 src2));
19920   effect(TEMP scratch);
19921   format %{ "vpcmpgtd  $dst,$src1,$src2\n\t"
19922             "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8I" %}
19923   ins_encode %{
19924     int vector_len = 1;
19925     __ vpcmpgtd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19926     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19927   %}
19928   ins_pipe( pipe_slow );
19929 %}
19930 
19931 instruct vcmple16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19932   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
19933             n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
19934             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19935   match(Set dst (VectorMaskCmp src1 src2));
19936   effect(TEMP dst, TEMP scratch);
19937   format %{ "vpcmpled  k2,$src1,$src2\n\t"
19938             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed16I" %}
19939   ins_encode %{
19940     int vector_len = 2;
19941     Assembler::ComparisonPredicate cmp = Assembler::le;
19942     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
19943     KRegister mask = k0; // The comparison itself is not being masked.
19944     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
19945     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
19946   %}
19947   ins_pipe( pipe_slow );
19948 %}
19949 
19950 instruct vcmpne2I(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
19951   predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
19952             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19953             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19954   match(Set dst (VectorMaskCmp src1 src2));
19955   effect(TEMP scratch);
19956   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
19957             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed2I" %}
19958   ins_encode %{
19959     int vector_len = 0;
19960     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19961     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19962   %}
19963   ins_pipe( pipe_slow );
19964 %}
19965 
19966 instruct vcmpne4I(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
19967   predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
19968             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19969             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19970   match(Set dst (VectorMaskCmp src1 src2));
19971   effect(TEMP scratch);
19972   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
19973             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4I" %}
19974   ins_encode %{
19975     int vector_len = 0;
19976     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19977     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19978   %}
19979   ins_pipe( pipe_slow );
19980 %}
19981 
19982 instruct vcmpne8I(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
19983   predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
19984             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
19985             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
19986   match(Set dst (VectorMaskCmp src1 src2));
19987   effect(TEMP scratch);
19988   format %{ "vpcmpeqd  $dst,$src1,$src2\n\t"
19989             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8I" %}
19990   ins_encode %{
19991     int vector_len = 1;
19992     __ vpcmpeqd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
19993     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
19994   %}
19995   ins_pipe( pipe_slow );
19996 %}
19997 
19998 instruct vcmpne16I(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
19999   predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
20000             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
20001             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_INT);
20002   match(Set dst (VectorMaskCmp src1 src2));
20003   effect(TEMP dst, TEMP scratch);
20004   format %{ "vpcmpneqd  k2,$src1,$src2\n\t"
20005             "vmovdqu32   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed16I" %}
20006   ins_encode %{
20007     int vector_len = 2;
20008     Assembler::ComparisonPredicate cmp = Assembler::neq;
20009     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20010     KRegister mask = k0; // The comparison itself is not being masked.
20011     __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20012     __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20013   %}
20014   ins_pipe( pipe_slow );
20015 %}
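
      // Byte vector compares follow the same scheme using vpcmpeqb/vpcmpgtb.
      // The 512-bit (64-byte) forms additionally require AVX512BW, hence the
      // VM_Version::supports_avx512bw() check in their predicates.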
20016 
20017 instruct vcmpeq8B(vecD dst, vecD src1, vecD src2) %{
20018   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20019             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20020             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20021   match(Set dst (VectorMaskCmp src1 src2));
20022   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t! cmpeq packed8B" %}
20023   ins_encode %{
20024     int vector_len = 0;
20025     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20026   %}
20027   ins_pipe( pipe_slow );
20028 %}
20029 
20030 instruct vcmpeq16B(vecX dst, vecX src1, vecX src2) %{
20031   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20032             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20033             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20034   match(Set dst (VectorMaskCmp src1 src2));
20035   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t! cmpeq packed16B" %}
20036   ins_encode %{
20037     int vector_len = 0;
20038     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20039   %}
20040   ins_pipe( pipe_slow );
20041 %}
20042 
20043 instruct vcmpeq32B(vecY dst, vecY src1, vecY src2) %{
20044   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20045             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20046             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20047   match(Set dst (VectorMaskCmp src1 src2));
20048   format %{ "vpcmpeqb  $dst,$src1,$src2\n\t! cmpeq packed32B" %}
20049   ins_encode %{
20050     int vector_len = 1;
20051     __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20052   %}
20053   ins_pipe( pipe_slow );
20054 %}
20055 
20056 instruct vcmpeq64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20057   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20058             n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
20059             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20060   match(Set dst (VectorMaskCmp src1 src2));
20061   effect(TEMP dst, TEMP scratch);
20062   format %{ "vpcmpeqb  k2,$src1,$src2\n\t"
20063             "vmovdqu8 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed64B" %}
20064   ins_encode %{
20065     int vector_len = 2;
20066     Assembler::ComparisonPredicate cmp = Assembler::eq;
20067     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20068     KRegister mask = k0; // The comparison itself is not being masked.
20069     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20070     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20071   %}
20072   ins_pipe( pipe_slow );
20073 %}
20074 
20075 instruct vcmplt8B(vecD dst, vecD src1, vecD src2) %{
20076   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20077             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20078             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20079   match(Set dst (VectorMaskCmp src1 src2));
20080   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed8B" %}
20081   ins_encode %{
20082     int vector_len = 0;
20083     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20084   %}
20085   ins_pipe( pipe_slow );
20086 %}
20087 
20088 instruct vcmplt16B(vecX dst, vecX src1, vecX src2) %{
20089   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20090             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20091             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20092   match(Set dst (VectorMaskCmp src1 src2));
20093   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed16B" %}
20094   ins_encode %{
20095     int vector_len = 0;
20096     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20097   %}
20098   ins_pipe( pipe_slow );
20099 %}
20100 
20101 instruct vcmplt32B(vecY dst, vecY src1, vecY src2) %{
20102   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20103             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20104             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20105   match(Set dst (VectorMaskCmp src1 src2));
20106   format %{ "vpcmpgtb  $dst,$src2,$src1\t! cmplt packed32B" %}
20107   ins_encode %{
20108     int vector_len = 1;
20109     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20110   %}
20111   ins_pipe( pipe_slow );
20112 %}
20113 
20114 instruct vcmplt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20115   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20116             n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
20117             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20118   match(Set dst (VectorMaskCmp src1 src2));
20119   effect(TEMP dst, TEMP scratch);
20120   format %{ "vpcmpnleb  k2,$src1,$src2\n\t"
20121             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed64B" %}
20122   ins_encode %{
20123     int vector_len = 2;
20124     Assembler::ComparisonPredicate cmp = Assembler::lt;
20125     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20126     KRegister mask = k0; // The comparison itself is not being masked.
20127     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20128     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20129   %}
20130   ins_pipe( pipe_slow );
20131 %}
20132 
20133 instruct vcmpgt8B(vecD dst, vecD src1, vecD src2) %{
20134   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20135             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20136             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20137   match(Set dst (VectorMaskCmp src1 src2));
20138   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed8B" %}
20139   ins_encode %{
20140     int vector_len = 0;
20141     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20142   %}
20143   ins_pipe( pipe_slow );
20144 %}
20145 
20146 instruct vcmpgt16B(vecX dst, vecX src1, vecX src2) %{
20147   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20148             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20149             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20150   match(Set dst (VectorMaskCmp src1 src2));
20151   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed16B" %}
20152   ins_encode %{
20153     int vector_len = 0;
20154     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20155   %}
20156   ins_pipe( pipe_slow );
20157 %}
20158 
20159 instruct vcmpgt32B(vecY dst, vecY src1, vecY src2) %{
20160   predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
20161             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20162             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20163   match(Set dst (VectorMaskCmp src1 src2));
20164   format %{ "vpcmpgtb  $dst,$src1,$src2\t! cmpgt packed32B" %}
20165   ins_encode %{
20166     int vector_len = 1;
20167     __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
20168   %}
20169   ins_pipe( pipe_slow );
20170 %}
20171 
20172 instruct vcmpgt64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
20173   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
20174             n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
20175             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20176   match(Set dst (VectorMaskCmp src1 src2));
20177   effect(TEMP dst, TEMP scratch);
20178   format %{ "vpcmpnleb  k2,$src1,$src2\n\t"
20179             "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed64B" %}
20180   ins_encode %{
20181     int vector_len = 2;
20182     Assembler::ComparisonPredicate cmp = Assembler::nle;
20183     KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
20184     KRegister mask = k0; // The comparison itself is not being masked.
20185     __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
20186     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
20187   %}
20188   ins_pipe( pipe_slow );
20189 %}
20190 
20191 instruct vcmpge8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
20192   predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
20193             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20194             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20195   match(Set dst (VectorMaskCmp src1 src2));
20196   effect(TEMP scratch);
20197   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
20198             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8B" %}
20199   ins_encode %{
20200     int vector_len = 0;
20201     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20202     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20203   %}
20204   ins_pipe( pipe_slow );
20205 %}
20206 
20207 instruct vcmpge16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
20208   predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
20209             n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
20210             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
20211   match(Set dst (VectorMaskCmp src1 src2));
20212   effect(TEMP scratch);
20213   format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
20214             "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16B" %}
20215   ins_encode %{
20216     int vector_len = 0;
20217     __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
20218     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
20219   %}
20220   ins_pipe( pipe_slow );
20221 %}
20222 
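      // Vector element extraction. The lane index is an immediate, masked to
      // the vector's lane count. Lane 0 is a plain register move; lanes within
      // the low 128 bits are reached with a shuffle or pextr*; higher lanes are
      // first brought down into a 128-bit temporary with vextract* and then
      // shuffled or extracted from there.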
20223 instruct extract8d(regD dst, vecZ src, vecZ tmp, immI idx) %{
20224   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20225   match(Set dst (ExtractD src idx));
20226   effect(TEMP tmp);
20227   ins_encode %{
20228     int vector_len = 2;
20229     int midx = 0x7 & $idx$$constant;
20230     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20231       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20232     } else if (midx == 1) {
20233       __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20234     } else if (midx > 1 && midx <= 7) {
20235       int extr_idx1 = midx / 2;
20236       int extr_idx2 = midx % 2;
20237       __ vextractf32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
20238       __ vpshufpd($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, extr_idx2, vector_len);
20239     }
20240   %}
20241   ins_pipe( pipe_slow );
20242 %}
20243 
20244 instruct extract4d(regD dst, vecY src, vecY tmp, immI idx) %{
20245   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20246   match(Set dst (ExtractD src idx));
20247   effect(TEMP tmp);
20248   ins_encode %{
20249     int vector_len = 1;
20250     int midx = 0x3 & $idx$$constant;
20251     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20252       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20253     } else if (midx == 1) {
20254       __ vpshufpd($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20255     } else if (midx > 1 && midx <= 3) {
20256       __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20257       __ vpshufpd($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, midx - 2, vector_len);
20258     }
20259   %}
20260   ins_pipe( pipe_slow );
20261 %}
20262 
20263 instruct extract2d(regD dst, vecX src, immI idx) %{
20264   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20265   match(Set dst (ExtractD src idx));
20266   ins_encode %{
20267     int midx = 0x1 & $idx$$constant;
20268     if ($dst$$XMMRegister != $src$$XMMRegister) {
20269       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20270     }
20271     if (midx > 0) {
20272       __ pshufpd($dst$$XMMRegister, $dst$$XMMRegister, midx);
20273     }
20274   %}
20275   ins_pipe( pipe_slow );
20276 %}
20277 
20278 instruct extract1d(regD dst, vecD src, immI idx) %{
20279   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 1);
20280   match(Set dst (ExtractD src idx));
20281   ins_encode %{
20282     int midx = 0x1 & $idx$$constant;
20283     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20284       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20285     }
20286   %}
20287   ins_pipe( pipe_slow );
20288 %}
20289 
20290 instruct extract16f(regF dst, vecZ src, vecZ tmp, immI idx) %{
20291   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 16);
20292   match(Set dst (ExtractF src idx));
20293   effect(TEMP tmp);
20294   ins_encode %{
20295     int vector_len = 2;
20296     int midx = 0xF & $idx$$constant;
20297     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20298       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20299     } else if (midx >= 1 && midx <= 3) {
20300       __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20301     } else {
20302       int extr_idx1 = midx / 4;
20303       int extr_idx2 = midx % 4;
20304       __ vextractf32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
20305       __ vpshufps($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, extr_idx2, vector_len);
20306     }
20307   %}
20308   ins_pipe( pipe_slow );
20309 %}
20310 
20311 instruct extract8f(regF dst, vecY src, vecY tmp, immI idx) %{
20312   predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20313   match(Set dst (ExtractF src idx));
20314   effect(TEMP tmp);
20315   ins_encode %{
20316     int vector_len = 1;
20317     int midx = 0x7 & $idx$$constant;
20318     if (midx == 0 && $dst$$XMMRegister != $src$$XMMRegister) {
20319       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
20320     } else if (midx >= 1 && midx <= 3) {
20321       __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, midx, vector_len);
20322     } else if (midx >= 4) {
20323       __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20324       __ vpshufps($dst$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, midx - 4, vector_len);
20325     }
20326   %}
20327   ins_pipe( pipe_slow );
20328 %}
20329 
20330 instruct extract4f(regF dst, vecX src, immI idx) %{
20331   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20332   match(Set dst (ExtractF src idx));
20333   ins_encode %{
20334     int midx = 0x3 & $idx$$constant;
20335     if ($dst$$XMMRegister != $src$$XMMRegister) {
20336       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20337     }
20338     if (midx > 0) {
20339       __ pshufps($dst$$XMMRegister, $dst$$XMMRegister, midx);
20340     }
20341   %}
20342   ins_pipe( pipe_slow );
20343 %}
20344 
20345 instruct extract2f(regF dst, vecD src, immI idx) %{
20346   predicate(UseSSE >= 2 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20347   match(Set dst (ExtractF src idx));
20348   ins_encode %{
20349     int midx = 0x1 & $idx$$constant;
20350     if ($dst$$XMMRegister != $src$$XMMRegister) {
20351       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
20352     }
20353     if (midx > 0) {
20355       __ pshufps($dst$$XMMRegister, $dst$$XMMRegister, midx);
20356     }
20357   %}
20358   ins_pipe( pipe_slow );
20359 %}
20360 
20361 instruct extract1l(rRegL dst, vecD src, immI idx) %{
20362   predicate(UseSSE > 1 && n->in(1)->bottom_type()->is_vect()->length() == 1);
20363   match(Set dst (ExtractL src idx));
20364   ins_encode %{
20365     int midx = 0x1 & $idx$$constant;
20366     if (midx == 0) {
20367       __ movq($dst$$Register, $src$$XMMRegister);
20368     }
20369   %}
20370   ins_pipe( pipe_slow );
20371 %}
20372 
20373 instruct extract2l(rRegL dst, vecX src, immI idx) %{
20374   predicate(UseSSE >= 4 && n->in(1)->bottom_type()->is_vect()->length() == 2);
20375   match(Set dst (ExtractL src idx));
20376   ins_encode %{
20377     int midx = 0x1 & $idx$$constant;
20378     if (midx == 0) {
20379       __ movq($dst$$Register, $src$$XMMRegister);
20380     } else {
20381       __ pextrq($dst$$Register, $src$$XMMRegister, midx);
20382     }
20383   %}
20384   ins_pipe( pipe_slow );
20385 %}
20386 
20387 instruct extract4l(rRegL dst, vecY src, immI idx, vecX tmp) %{
20388   predicate(UseAVX > 1 && n->in(1)->bottom_type()->is_vect()->length() == 4);
20389   match(Set dst (ExtractL src idx));
20390   effect(TEMP tmp);
20391   ins_encode %{
20392     int midx = 0x3 & $idx$$constant;
20393     if (midx == 0) {
20394       __ movq($dst$$Register, $src$$XMMRegister);
20395     } else if (midx == 1) {
20396       __ pextrq($dst$$Register, $src$$XMMRegister, midx);
20397     } else {
20398       __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
20399       __ pextrq($dst$$Register, $tmp$$XMMRegister, midx - 2);
20400     }
20401   %}
20402   ins_pipe( pipe_slow );
20403 %}
20404 
20405 instruct extract8l(rRegL dst, vecZ src, vecX tmp, immI idx) %{
20406   predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 8);
20407   match(Set dst (ExtractL src idx));
20408   effect(TEMP tmp);
20409   ins_encode %{
20410     int midx = 0x7 & $idx$$constant;
20411     if (midx == 0) {
20412       __ movq($dst$$Register, $src$$XMMRegister);
20413     } else if (midx == 1) {
20414       __ pextrq($dst$$Register, $src$$XMMRegister, midx);
20415     } else {
20416       // Using 2 because there are 2 longs in 128-bit
20417       int extr_idx1 = midx / 2;
20418       int extr_idx2 = midx % 2;
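      // e.g. midx == 5: 5 / 2 == 2 and 5 % 2 == 1, so extract 128-bit
      // lane 2 (elements 4..5) and take its qword 1.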
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrq($dst$$Register, $tmp$$XMMRegister, extr_idx2);
    }
  %}
  ins_pipe( pipe_slow );
%}

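// Int, short and byte extraction follow the same pattern (movdl/pextr*);
// shorts and bytes are additionally sign-extended to 32 bits with
// movswl/movsbl.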
instruct extract2i(rRegI dst, vecD src, immI idx) %{
  predicate(UseSSE > 3 && n->in(1)->bottom_type()->is_vect()->length() == 2);
  match(Set dst (ExtractI src idx));
  ins_encode %{
    int midx = 0x1 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract4i(rRegI dst, vecX src, immI idx) %{
  predicate(UseSSE > 3 && n->in(1)->bottom_type()->is_vect()->length() == 4);
  match(Set dst (ExtractI src idx));
  ins_encode %{
    int midx = 0x3 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract8i(rRegI dst, vecY src, vecX tmp, immI idx) %{
  predicate(UseAVX > 1 && n->in(1)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (ExtractI src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    } else if (midx >= 4) {
      __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
      __ pextrd($dst$$Register, $tmp$$XMMRegister, midx - 4);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract16i(rRegI dst, vecZ src, vecX tmp, immI idx) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (ExtractI src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
    } else if (midx >= 1 && midx <= 3) {
      __ pextrd($dst$$Register, $src$$XMMRegister, midx);
    } else {
      // Using 4 because there are 4 ints in a 128-bit lane
      int extr_idx1 = midx / 4;
      int extr_idx2 = midx % 4;
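      // e.g. midx == 9: 9 / 4 == 2 and 9 % 4 == 1, so extract 128-bit
      // lane 2 (elements 8..11) and take its dword 1.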
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrd($dst$$Register, $tmp$$XMMRegister, extr_idx2);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract4s(rRegI dst, vecD src, immI idx) %{
  predicate(UseSSE > 1 && n->in(1)->bottom_type()->is_vect()->length() == 4);
  match(Set dst (ExtractS src idx));
  ins_encode %{
    int midx = 0x3 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract8s(rRegI dst, vecX src, immI idx) %{
  predicate(UseSSE > 1 && n->in(1)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (ExtractS src idx));
  ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract16s(rRegI dst, vecY src, vecX tmp, immI idx) %{
  predicate(UseAVX > 1 && n->in(1)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (ExtractS src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 7) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    } else {
      __ vextracti128($tmp$$XMMRegister, $src$$XMMRegister, 0x1);
      __ pextrw($dst$$Register, $tmp$$XMMRegister, midx - 8);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract32s(rRegI dst, vecZ src, vecX tmp, immI idx) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 32);
  match(Set dst (ExtractS src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x1F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movswl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 7) {
      __ pextrw($dst$$Register, $src$$XMMRegister, midx);
      __ movswl($dst$$Register, $dst$$Register);
    } else {
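      // Using 8 because there are 8 shorts in a 128-bit lane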
      int extr_idx1 = midx / 8;
      int extr_idx2 = midx % 8;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrw($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movswl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract8b(rRegI dst, vecD src, immI idx) %{
  predicate(UseSSE >= 4 && n->in(1)->bottom_type()->is_vect()->length() == 8);
  match(Set dst (ExtractB src idx));
  ins_encode %{
    int midx = 0x7 & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract16b(rRegI dst, vecX src, immI idx) %{
  predicate(UseSSE >= 4 && n->in(1)->bottom_type()->is_vect()->length() == 16);
  match(Set dst (ExtractB src idx));
  ins_encode %{
    int midx = 0xF & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract32b(rRegI dst, vecY src, vecX tmp, immI idx) %{
  predicate(UseAVX > 0 && n->in(1)->bottom_type()->is_vect()->length() == 32);
  match(Set dst (ExtractB src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x1F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 15) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    } else {
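      // Using 16 because there are 16 bytes in a 128-bit lane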
      int extr_idx1 = midx / 16;
      int extr_idx2 = midx % 16;
      __ vextractf128($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrb($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct extract64b(rRegI dst, vecZ src, vecX tmp, immI idx) %{
  predicate(UseAVX > 2 && n->in(1)->bottom_type()->is_vect()->length() == 64);
  match(Set dst (ExtractB src idx));
  effect(TEMP tmp);
  ins_encode %{
    int midx = 0x3F & $idx$$constant;
    if (midx == 0) {
      __ movdl($dst$$Register, $src$$XMMRegister);
      __ movsbl($dst$$Register, $dst$$Register);
    } else if (midx >= 1 && midx <= 15) {
      __ pextrb($dst$$Register, $src$$XMMRegister, midx);
      __ movsbl($dst$$Register, $dst$$Register);
    } else {
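      // Using 16 because there are 16 bytes in a 128-bit lane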
      int extr_idx1 = midx / 16;
      int extr_idx2 = midx % 16;
      __ vextracti32x4($tmp$$XMMRegister, $src$$XMMRegister, extr_idx1);
      __ pextrb($dst$$Register, $tmp$$XMMRegister, extr_idx2);
      __ movsbl($dst$$Register, $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

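// AVX1/AVX2 provide no direct ge, le or ne packed compares, so the rules
// below synthesize them: emit the inverse compare (gt or eq) and then flip
// every bit of the resulting mask by XORing with an all-ones constant.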
instruct vcmpge32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtb($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

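// For 512-bit vectors the compare writes a k mask register instead, and the
// mask is expanded back into a vector of all-ones/all-zeros lanes with a
// zero-masked load of an all-bits-set constant.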
instruct vcmpge64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpnltb  k2,$src1,$src2\n\t"
            "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed64B" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::nlt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpleb  k2,$src1,$src2\n\t"
            "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed64B" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::le;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne8B(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne16B(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16B" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne32B(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqb  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed32B" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne64B(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpneqb  k2,$src1,$src2\n\t"
            "vmovdqu8   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed64B" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::neq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

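// eq, lt and gt map directly onto a single vpcmpeq*/vpcmpgt* instruction
// (lt simply swaps the operands of gt), so those rules need no scratch
// register.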
instruct vcmpeq4S(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq8S(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq16S(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqw  $dst,$src1,$src2\t! cmpeq packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpeqw  k2,$src1,$src2\n\t"
            "vmovdqu16 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::eq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt4S(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt8S(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt16S(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src2,$src1\t! cmplt packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpltw  k2,$src1,$src2\n\t"
            "vmovdqu16 $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::lt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt4S(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt8S(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt16S(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtw  $dst,$src1,$src2\t! cmpgt packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpnlew  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpgt packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::nle;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src2,$src1\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpge packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpge32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ge &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpnltw  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpge packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::nlt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpgtw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmple packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmple32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::le &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmplew  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmple packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::le;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne4S(vecD dst, vecD src1, vecD src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed4S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne8S(vecX dst, vecX src1, vecX src2, rRegL scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed8S" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne16S(vecY dst, vecY src1, vecY src2, rRegL scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 16 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP scratch);
  format %{ "vpcmpeqw  $dst,$src1,$src2\n\t"
            "vpxor $dst,$dst,0xFFFFFFFF\t! cmpneq packed16S" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_all_bits_set()), vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpne32S(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::ne &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpneqw  k2,$src1,$src2\n\t"
            "vmovdqu16   $dst, k2{z}, 0xFFFFFFFFFF \t! cmpneq packed32S" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::neq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq1L(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq2L(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq4L(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpeqq  $dst,$src1,$src2\t! cmpeq packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpeqq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpeq8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::eq &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpeqq  k2,$src1,$src2\n\t"
            "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmpeq packed8L" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::eq;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt1L(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt2L(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt4L(vecY dst, vecY src1, vecY src2) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src2,$src1\t! cmplt packed4L" %}
  ins_encode %{
    int vector_len = 1;
    __ vpcmpgtq($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmplt8L(vecZ dst, vecZ src1, vecZ src2, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::lt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  effect(TEMP dst, TEMP scratch);
  format %{ "vpcmpltq  k2,$src1,$src2\n\t"
            "vmovdqu64 $dst, k2{z}, 0xFFFFFFFFFF \t! cmplt packed8L" %}
  ins_encode %{
    int vector_len = 2;
    Assembler::ComparisonPredicate cmp = Assembler::lt;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    KRegister mask = k0; // The comparison itself is not being masked.
    __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vector_len);
    __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt1L(vecD dst, vecD src1, vecD src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 1 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed1L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct vcmpgt2L(vecX dst, vecX src1, vecX src2) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 2 &&
            n->as_VectorMaskCmp()->get_predicate() == BoolTest::gt &&
            n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_LONG);
  match(Set dst (VectorMaskCmp src1 src2));
  format %{ "vpcmpgtq  $dst,$src1,$src2\t! cmpgt packed2L" %}
  ins_encode %{
    int vector_len = 0;
    __ vpcmpgtq($dst$$XMMRegister, $src1$$XMMRegi