src/cpu/x86/vm/x86.ad

2877   ins_cost(145);
2878   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2879   ins_encode %{
2880     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2881   %}
2882   ins_pipe( pipe_slow );
2883 %}
2884 
2885 instruct storeV64(memory mem, vecZ src) %{
2886   predicate(n->as_StoreVector()->memory_size() == 64);
2887   match(Set mem (StoreVector mem src));
2888   ins_cost(145);
2889   format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
2890   ins_encode %{
2891     int vector_len = 2;
2892     __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
2893   %}
2894   ins_pipe( pipe_slow );
2895 %}
2896 
2897 // Replicate byte scalar to be vector
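// Annotation: the patterns below widen a scalar broadcast step by step.
// movd moves the GPR into bits 0..31 of the XMM register, punpcklbw plus
// pshuflw smear the byte across the low 64 bits, and punpcklqdq fills all
// 128 bits. For the vecY/vecZ cases the vinserti128h/vinserti64x4h macro
// assembler helpers (as used in this patch) copy the low half into the
// upper 128/256 bits. "k0" in the EVEX formats denotes the implicit
// "no masking" register.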
2898 instruct Repl4B(vecS dst, rRegI src) %{
2899   predicate(n->as_Vector()->length() == 4);
2900   match(Set dst (ReplicateB src));
2901   format %{ "movd    $dst,$src\n\t"
2902             "punpcklbw $dst,$dst\n\t"
2903             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
2904   ins_encode %{
2905     __ movdl($dst$$XMMRegister, $src$$Register);
2906     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2907     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2908   %}
2909   ins_pipe( pipe_slow );
2910 %}
2911 
2912 instruct Repl8B(vecD dst, rRegI src) %{
2913   predicate(n->as_Vector()->length() == 8);
2914   match(Set dst (ReplicateB src));
2915   format %{ "movd    $dst,$src\n\t"
2916             "punpcklbw $dst,$dst\n\t"
2917             "pshuflw $dst,$dst,0x00\t! replicate8B" %}

2918   ins_encode %{
2919     __ movdl($dst$$XMMRegister, $src$$Register);
2920     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2921     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2922   %}
2923   ins_pipe( pipe_slow );
2924 %}
2925 
2926 instruct Repl16B(vecX dst, rRegI src) %{
2927   predicate(n->as_Vector()->length() == 16);
2928   match(Set dst (ReplicateB src));
2929   format %{ "movd    $dst,$src\n\t"
2930             "punpcklbw $dst,$dst\n\t"
2931             "pshuflw $dst,$dst,0x00\n\t"
2932             "punpcklqdq $dst,$dst\t! replicate16B" %}
2933   ins_encode %{
2934     __ movdl($dst$$XMMRegister, $src$$Register);
2935     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2936     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2937     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2938   %}
2939   ins_pipe( pipe_slow );
2940 %}
2941 
2942 instruct Repl32B(vecY dst, rRegI src) %{
2943   predicate(n->as_Vector()->length() == 32);
2944   match(Set dst (ReplicateB src));
2945   format %{ "movd    $dst,$src\n\t"
2946             "punpcklbw $dst,$dst\n\t"
2947             "pshuflw $dst,$dst,0x00\n\t"
2948             "punpcklqdq $dst,$dst\n\t"
2949             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
2950   ins_encode %{
2951     __ movdl($dst$$XMMRegister, $src$$Register);
2952     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2953     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2954     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2955     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2956   %}
2957   ins_pipe( pipe_slow );
2958 %}
2959 
2960 instruct Repl64B(vecZ dst, rRegI src) %{
2961   predicate(n->as_Vector()->length() == 64);
2962   match(Set dst (ReplicateB src));
2963   format %{ "movd    $dst,$src\n\t"
2964             "punpcklbw $dst,$dst\n\t"
2965             "pshuflw $dst,$dst,0x00\n\t"
2966             "punpcklqdq $dst,$dst\n\t"
2967             "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t"
2968             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %}
2969   ins_encode %{
2970     __ movdl($dst$$XMMRegister, $src$$Register);
2971     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2972     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2973     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2974     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2975     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2976   %}
2977   ins_pipe( pipe_slow );
2978 %}
2979 
2980 // Replicate byte scalar immediate to be vector by loading from const table.
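// Annotation: replicate4_imm/replicate8_imm (helpers defined elsewhere in
// this file) build a 4- or 8-byte constant by repeating the immediate at
// the given element width (1 = byte here); $constantaddress materializes it
// in the constant table, so the broadcast becomes a 32/64-bit load plus the
// same widening shuffles as above.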
2981 instruct Repl4B_imm(vecS dst, immI con) %{
2982   predicate(n->as_Vector()->length() == 4);
2983   match(Set dst (ReplicateB con));
2984   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
2985   ins_encode %{
2986     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
2987   %}
2988   ins_pipe( pipe_slow );
2989 %}
2990 
2991 instruct Repl8B_imm(vecD dst, immI con) %{
2992   predicate(n->as_Vector()->length() == 8);
2993   match(Set dst (ReplicateB con));
2994   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
2995   ins_encode %{
2996     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2997   %}
2998   ins_pipe( pipe_slow );
2999 %}
3000 
3001 instruct Repl16B_imm(vecX dst, immI con) %{
3002   predicate(n->as_Vector()->length() == 16);
3003   match(Set dst (ReplicateB con));
3004   format %{ "movq    $dst,[$constantaddress]\n\t"
3005             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3006   ins_encode %{
3007     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3008     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3009   %}
3010   ins_pipe( pipe_slow );
3011 %}
3012 
3013 instruct Repl32B_imm(vecY dst, immI con) %{
3014   predicate(n->as_Vector()->length() == 32);
3015   match(Set dst (ReplicateB con));
3016   format %{ "movq    $dst,[$constantaddress]\n\t"
3017             "punpcklqdq $dst,$dst\n\t"
3018             "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
3019   ins_encode %{
3020     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3021     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3022     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3023   %}
3024   ins_pipe( pipe_slow );
3025 %}
3026 
3027 instruct Repl64B_imm(vecZ dst, immI con) %{
3028   predicate(n->as_Vector()->length() == 64);
3029   match(Set dst (ReplicateB con));
3030   format %{ "movq    $dst,[$constantaddress]\n\t"
3031             "punpcklqdq $dst,$dst\n\t"
3032             "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
3033             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
3034   ins_encode %{
3035     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3036     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3037     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3038     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3039   %}
3040   ins_pipe( pipe_slow );
3041 %}
3042 
3043 // Replicate byte scalar zero to be vector
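// Annotation: zeroing uses the xor-with-self idiom. Up to 128 bits the
// legacy pxor suffices; the wider forms use 3-operand vpxor, where
// vector_len encodes the operand width (0 = 128, 1 = 256, 2 = 512 bits).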
3044 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3045   predicate(n->as_Vector()->length() == 4);
3046   match(Set dst (ReplicateB zero));
3047   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3048   ins_encode %{
3049     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3050   %}
3051   ins_pipe( fpu_reg_reg );
3052 %}
3053 
3054 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3055   predicate(n->as_Vector()->length() == 8);
3056   match(Set dst (ReplicateB zero));
3057   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3058   ins_encode %{
3059     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3060   %}
3061   ins_pipe( fpu_reg_reg );
3062 %}
3063 
3064 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3065   predicate(n->as_Vector()->length() == 16);
3066   match(Set dst (ReplicateB zero));
3067   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3068   ins_encode %{
3069     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3070   %}
3071   ins_pipe( fpu_reg_reg );
3072 %}
3073 
3074 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3075   predicate(n->as_Vector()->length() == 32);
3076   match(Set dst (ReplicateB zero));
3077   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3078   ins_encode %{
3079     // vpxor with a 256-bit operand requires AVX2; vector_len = 1 selects the 256-bit form.
3080     int vector_len = 1;
3081     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3082   %}
3083   ins_pipe( fpu_reg_reg );
3084 %}
3085 
3086 instruct Repl64B_zero(vecZ dst, immI0 zero) %{
3087   predicate(n->as_Vector()->length() == 64);
3088   match(Set dst (ReplicateB zero));
3089   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3090   ins_encode %{
3091     // vpxor with a 512-bit operand requires EVEX (AVX-512); vector_len = 2 selects the 512-bit form.
3092     int vector_len = 2;
3093     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3094   %}
3095   ins_pipe( fpu_reg_reg );
3096 %}
3097 
3098 // Replicate char/short (2 byte) scalar to be vector
3099 instruct Repl2S(vecS dst, rRegI src) %{
3100   predicate(n->as_Vector()->length() == 2);
3101   match(Set dst (ReplicateS src));
3102   format %{ "movd    $dst,$src\n\t"
3103             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3104   ins_encode %{
3105     __ movdl($dst$$XMMRegister, $src$$Register);
3106     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3107   %}
3108   ins_pipe( fpu_reg_reg );
3109 %}
3110 
3111 instruct Repl4S(vecD dst, rRegI src) %{
3112   predicate(n->as_Vector()->length() == 4);
3113   match(Set dst (ReplicateS src));
3114   format %{ "movd    $dst,$src\n\t"
3115             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3116   ins_encode %{
3117     __ movdl($dst$$XMMRegister, $src$$Register);
3118     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3119   %}
3120   ins_pipe( fpu_reg_reg );
3121 %}
3122 
3123 instruct Repl8S(vecX dst, rRegI src) %{
3124   predicate(n->as_Vector()->length() == 8);
3125   match(Set dst (ReplicateS src));
3126   format %{ "movd    $dst,$src\n\t"
3127             "pshuflw $dst,$dst,0x00\n\t"
3128             "punpcklqdq $dst,$dst\t! replicate8S" %}
3129   ins_encode %{
3130     __ movdl($dst$$XMMRegister, $src$$Register);
3131     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3132     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3133   %}
3134   ins_pipe( pipe_slow );
3135 %}
3136 
3137 instruct Repl16S(vecY dst, rRegI src) %{
3138   predicate(n->as_Vector()->length() == 16);
3139   match(Set dst (ReplicateS src));
3140   format %{ "movd    $dst,$src\n\t"
3141             "pshuflw $dst,$dst,0x00\n\t"
3142             "punpcklqdq $dst,$dst\n\t"
3143             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
3144   ins_encode %{
3145     __ movdl($dst$$XMMRegister, $src$$Register);
3146     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3147     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3148     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3149   %}
3150   ins_pipe( pipe_slow );
3151 %}
3152 
3153 instruct Repl32S(vecZ dst, rRegI src) %{
3154   predicate(n->as_Vector()->length() == 32);
3155   match(Set dst (ReplicateS src));
3156   format %{ "movd    $dst,$src\n\t"
3157             "pshuflw $dst,$dst,0x00\n\t"
3158             "punpcklqdq $dst,$dst\n\t"
3159             "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t"
3160             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %}
3161   ins_encode %{
3162     __ movdl($dst$$XMMRegister, $src$$Register);
3163     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3164     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3165     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3166     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3167   %}
3168   ins_pipe( pipe_slow );
3169 %}
3170 
3171 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3172 instruct Repl2S_imm(vecS dst, immI con) %{
3173   predicate(n->as_Vector()->length() == 2);
3174   match(Set dst (ReplicateS con));
3175   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3176   ins_encode %{
3177     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3178   %}
3179   ins_pipe( fpu_reg_reg );
3180 %}
3181 
3182 instruct Repl4S_imm(vecD dst, immI con) %{
3183   predicate(n->as_Vector()->length() == 4);
3184   match(Set dst (ReplicateS con));
3185   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3186   ins_encode %{
3187     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3188   %}
3189   ins_pipe( fpu_reg_reg );
3190 %}
3191 
3192 instruct Repl8S_imm(vecX dst, immI con) %{
3193   predicate(n->as_Vector()->length() == 8);
3194   match(Set dst (ReplicateS con));
3195   format %{ "movq    $dst,[$constantaddress]\n\t"
3196             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3197   ins_encode %{
3198     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3199     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3200   %}
3201   ins_pipe( pipe_slow );
3202 %}
3203 
3204 instruct Repl16S_imm(vecY dst, immI con) %{
3205   predicate(n->as_Vector()->length() == 16);
3206   match(Set dst (ReplicateS con));
3207   format %{ "movq    $dst,[$constantaddress]\n\t"
3208             "punpcklqdq $dst,$dst\n\t"
3209             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
3210   ins_encode %{
3211     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3212     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3213     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3214   %}
3215   ins_pipe( pipe_slow );
3216 %}
3217 
3218 instruct Repl32S_imm(vecZ dst, immI con) %{
3219   predicate(n->as_Vector()->length() == 32);
3220   match(Set dst (ReplicateS con));
3221   format %{ "movq    $dst,[$constantaddress]\n\t"
3222             "punpcklqdq $dst,$dst\n\t"
3223             "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
3224             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
3225   ins_encode %{
3226     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3227     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3228     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3229     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3230   %}
3231   ins_pipe( pipe_slow );
3232 %}
3233 
3234 // Replicate char/short (2 byte) scalar zero to be vector
3235 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3236   predicate(n->as_Vector()->length() == 2);
3237   match(Set dst (ReplicateS zero));
3238   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3239   ins_encode %{
3240     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3241   %}
3242   ins_pipe( fpu_reg_reg );
3243 %}
3244 
3245 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3246   predicate(n->as_Vector()->length() == 4);
3247   match(Set dst (ReplicateS zero));
3248   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3249   ins_encode %{
3250     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3251   %}
3252   ins_pipe( fpu_reg_reg );
3253 %}
3254 
3255 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3256   predicate(n->as_Vector()->length() == 8);
3257   match(Set dst (ReplicateS zero));
3258   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3259   ins_encode %{
3260     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3261   %}
3262   ins_pipe( fpu_reg_reg );
3263 %}
3264 
3265 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3266   predicate(n->as_Vector()->length() == 16);
3267   match(Set dst (ReplicateS zero));
3268   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3269   ins_encode %{
3270     // vpxor with a 256-bit operand requires AVX2; vector_len = 1 selects the 256-bit form.
3271     int vector_len = 1;
3272     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3273   %}
3274   ins_pipe( fpu_reg_reg );
3275 %}
3276 
3277 instruct Repl32S_zero(vecZ dst, immI0 zero) %{
3278   predicate(n->as_Vector()->length() == 32);
3279   match(Set dst (ReplicateS zero));
3280   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
3281   ins_encode %{
3282     // vpxor with a 512-bit operand requires EVEX (AVX-512); vector_len = 2 selects the 512-bit form.
3283     int vector_len = 2;
3284     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3285   %}
3286   ins_pipe( fpu_reg_reg );
3287 %}
3288 
3289 // Replicate integer (4 byte) scalar to be vector
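// Annotation: for 32-bit elements a single pshufd with immediate 0x00
// (dword 0 selected into all four positions) broadcasts across 128 bits,
// so the byte/word shuffle cascade is unnecessary.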
3290 instruct Repl2I(vecD dst, rRegI src) %{
3291   predicate(n->as_Vector()->length() == 2);
3292   match(Set dst (ReplicateI src));
3293   format %{ "movd    $dst,$src\n\t"
3294             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3295   ins_encode %{
3296     __ movdl($dst$$XMMRegister, $src$$Register);
3297     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3298   %}
3299   ins_pipe( fpu_reg_reg );
3300 %}
3301 
3302 instruct Repl4I(vecX dst, rRegI src) %{
3303   predicate(n->as_Vector()->length() == 4);
3304   match(Set dst (ReplicateI src));
3305   format %{ "movd    $dst,$src\n\t"
3306             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3307   ins_encode %{
3308     __ movdl($dst$$XMMRegister, $src$$Register);
3309     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3310   %}
3311   ins_pipe( pipe_slow );
3312 %}
3313 
3314 instruct Repl8I(vecY dst, rRegI src) %{
3315   predicate(n->as_Vector()->length() == 8);
3316   match(Set dst (ReplicateI src));
3317   format %{ "movd    $dst,$src\n\t"
3318             "pshufd  $dst,$dst,0x00\n\t"
3319             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3320   ins_encode %{
3321     __ movdl($dst$$XMMRegister, $src$$Register);
3322     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3323     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3324   %}
3325   ins_pipe( pipe_slow );
3326 %}
3327 
3328 instruct Repl16I(vecZ dst, rRegI src) %{
3329   predicate(n->as_Vector()->length() == 16);
3330   match(Set dst (ReplicateI src));
3331   format %{ "movd    $dst,$src\n\t"
3332             "pshufd  $dst,$dst,0x00\n\t"
3333             "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
3334             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
3335   ins_encode %{
3336     __ movdl($dst$$XMMRegister, $src$$Register);
3337     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3338     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3339     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3340   %}
3341   ins_pipe( pipe_slow );
3342 %}
3343 
3344 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3345 instruct Repl2I_imm(vecD dst, immI con) %{
3346   predicate(n->as_Vector()->length() == 2);
3347   match(Set dst (ReplicateI con));
3348   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3349   ins_encode %{
3350     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3351   %}
3352   ins_pipe( fpu_reg_reg );
3353 %}
3354 
3355 instruct Repl4I_imm(vecX dst, immI con) %{
3356   predicate(n->as_Vector()->length() == 4);
3357   match(Set dst (ReplicateI con));
3358   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3359             "punpcklqdq $dst,$dst" %}
3360   ins_encode %{
3361     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3362     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3363   %}
3364   ins_pipe( pipe_slow );
3365 %}
3366 
3367 instruct Repl8I_imm(vecY dst, immI con) %{
3368   predicate(n->as_Vector()->length() == 8);
3369   match(Set dst (ReplicateI con));
3370   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3371             "punpcklqdq $dst,$dst\n\t"
3372             "vinserti128h $dst,$dst,$dst" %}
3373   ins_encode %{
3374     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3375     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3376     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3377   %}
3378   ins_pipe( pipe_slow );
3379 %}
3380 
3381 instruct Repl16I_imm(vecZ dst, immI con) %{
3382   predicate(n->as_Vector()->length() == 16);
3383   match(Set dst (ReplicateI con));
3384   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
3385             "punpcklqdq $dst,$dst\n\t"
3386             "vinserti128h $dst,$dst,$dst\n\t"
3387             "vinserti64x4h $dst k0,$dst,$dst" %}
3388   ins_encode %{
3389     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3390     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3391     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3392     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3393   %}
3394   ins_pipe( pipe_slow );
3395 %}
3396 
3397 // Integer could be loaded into xmm register directly from memory.
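// Annotation: loading the scalar with movd directly from memory skips the
// general-purpose register and the GPR-to-XMM transfer of the register forms.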
3398 instruct Repl2I_mem(vecD dst, memory mem) %{
3399   predicate(n->as_Vector()->length() == 2);
3400   match(Set dst (ReplicateI (LoadI mem)));
3401   format %{ "movd    $dst,$mem\n\t"
3402             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3403   ins_encode %{
3404     __ movdl($dst$$XMMRegister, $mem$$Address);
3405     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3406   %}
3407   ins_pipe( fpu_reg_reg );
3408 %}
3409 
3410 instruct Repl4I_mem(vecX dst, memory mem) %{
3411   predicate(n->as_Vector()->length() == 4);
3412   match(Set dst (ReplicateI (LoadI mem)));
3413   format %{ "movd    $dst,$mem\n\t"
3414             "pshufd  $dst,$dst,0x00\t! replicate4I" %}

3415   ins_encode %{
3416     __ movdl($dst$$XMMRegister, $mem$$Address);
3417     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3418   %}
3419   ins_pipe( pipe_slow );
3420 %}
3421 
3422 instruct Repl8I_mem(vecY dst, memory mem) %{
3423   predicate(n->as_Vector()->length() == 8);
3424   match(Set dst (ReplicateI (LoadI mem)));
3425   format %{ "movd    $dst,$mem\n\t"
3426             "pshufd  $dst,$dst,0x00\n\t"
3427             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3428   ins_encode %{
3429     __ movdl($dst$$XMMRegister, $mem$$Address);
3430     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3431     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3432   %}
3433   ins_pipe( pipe_slow );
3434 %}
3435 
3436 instruct Repl16I_mem(vecZ dst, memory mem) %{
3437   predicate(n->as_Vector()->length() == 16);
3438   match(Set dst (ReplicateI (LoadI mem)));
3439   format %{ "movd    $dst,$mem\n\t"
3440             "pshufd  $dst,$dst,0x00\n\t"
3441             "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
3442             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
3443   ins_encode %{
3444     __ movdl($dst$$XMMRegister, $mem$$Address);
3445     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3446     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3447     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3448   %}
3449   ins_pipe( pipe_slow );
3450 %}
3451 
3452 // Replicate integer (4 byte) scalar zero to be vector
3453 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3454   predicate(n->as_Vector()->length() == 2);
3455   match(Set dst (ReplicateI zero));
3456   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3457   ins_encode %{
3458     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3459   %}
3460   ins_pipe( fpu_reg_reg );
3461 %}
3462 
3463 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3464   predicate(n->as_Vector()->length() == 4);
3465   match(Set dst (ReplicateI zero));
3466   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
3467   ins_encode %{
3468     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3469   %}
3470   ins_pipe( fpu_reg_reg );
3471 %}
3472 
3473 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3474   predicate(n->as_Vector()->length() == 8);
3475   match(Set dst (ReplicateI zero));
3476   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3477   ins_encode %{
3478     // vpxor with a 256-bit operand requires AVX2; vector_len = 1 selects the 256-bit form.
3479     int vector_len = 1;
3480     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3481   %}
3482   ins_pipe( fpu_reg_reg );
3483 %}
3484 
3485 instruct Repl16I_zero(vecZ dst, immI0 zero) %{
3486   predicate(n->as_Vector()->length() == 16);
3487   match(Set dst (ReplicateI zero));
3488   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
3489   ins_encode %{
3490     // vpxor with a 512-bit operand requires EVEX (AVX-512); vector_len = 2 selects the 512-bit form.
3491     int vector_len = 2;
3492     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3493   %}
3494   ins_pipe( fpu_reg_reg );
3495 %}
3496 
3497 // Replicate long (8 byte) scalar to be vector
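// Annotation: on _LP64 the long fits in one GPR and movdq transfers it
// directly; on 32-bit it lives in a register pair, so the #else patterns
// move the lo/hi halves separately (HIGH_FROM_LOW selects the high half)
// and join them with punpckldq before the usual qword broadcast.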
3498 #ifdef _LP64
3499 instruct Repl2L(vecX dst, rRegL src) %{
3500   predicate(n->as_Vector()->length() == 2);
3501   match(Set dst (ReplicateL src));
3502   format %{ "movdq   $dst,$src\n\t"
3503             "punpcklqdq $dst,$dst\t! replicate2L" %}
3504   ins_encode %{
3505     __ movdq($dst$$XMMRegister, $src$$Register);
3506     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3507   %}
3508   ins_pipe( pipe_slow );
3509 %}
3510 
3511 instruct Repl4L(vecY dst, rRegL src) %{
3512   predicate(n->as_Vector()->length() == 4);
3513   match(Set dst (ReplicateL src));
3514   format %{ "movdq   $dst,$src\n\t"
3515             "punpcklqdq $dst,$dst\n\t"
3516             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3517   ins_encode %{
3518     __ movdq($dst$$XMMRegister, $src$$Register);
3519     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3520     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3521   %}
3522   ins_pipe( pipe_slow );
3523 %}
3524 
3525 instruct Repl8L(vecZ dst, rRegL src) %{
3526   predicate(n->as_Vector()->length() == 8);
3527   match(Set dst (ReplicateL src));
3528   format %{ "movdq   $dst,$src\n\t"
3529             "punpcklqdq $dst,$dst\n\t"
3530             "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
3531             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
3532   ins_encode %{
3533     __ movdq($dst$$XMMRegister, $src$$Register);
3534     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3535     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3536     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3537   %}
3538   ins_pipe( pipe_slow );
3539 %}
3540 #else // _LP64
3541 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3542   predicate(n->as_Vector()->length() == 2);
3543   match(Set dst (ReplicateL src));
3544   effect(TEMP dst, USE src, TEMP tmp);
3545   format %{ "movdl   $dst,$src.lo\n\t"
3546             "movdl   $tmp,$src.hi\n\t"
3547             "punpckldq $dst,$tmp\n\t"
3548             "punpcklqdq $dst,$dst\t! replicate2L"%}
3549   ins_encode %{
3550     __ movdl($dst$$XMMRegister, $src$$Register);
3551     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3552     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3553     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3554   %}
3555   ins_pipe( pipe_slow );
3556 %}
3557 
3558 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3559   predicate(n->as_Vector()->length() == 4);
3560   match(Set dst (ReplicateL src));
3561   effect(TEMP dst, USE src, TEMP tmp);
3562   format %{ "movdl   $dst,$src.lo\n\t"
3563             "movdl   $tmp,$src.hi\n\t"
3564             "punpckldq $dst,$tmp\n\t"
3565             "punpcklqdq $dst,$dst\n\t"
3566             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3567   ins_encode %{
3568     __ movdl($dst$$XMMRegister, $src$$Register);
3569     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3570     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3571     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3572     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3573   %}
3574   ins_pipe( pipe_slow );
3575 %}
3576 
3577 instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{
3578   predicate(n->as_Vector()->length() == 8);
3579   match(Set dst (ReplicateL src));
3580   effect(TEMP dst, USE src, TEMP tmp);
3581   format %{ "movdl   $dst,$src.lo\n\t"
3582             "movdl   $tmp,$src.hi\n\t"
3583             "punpckldq $dst,$tmp\n\t"
3584             "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
3585             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
3586   ins_encode %{
3587     __ movdl($dst$$XMMRegister, $src$$Register);
3588     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3589     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3590     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3591     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3592     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3593   %}
3594   ins_pipe( pipe_slow );
3595 %}
3596 #endif // _LP64
3597 
3598 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3599 instruct Repl2L_imm(vecX dst, immL con) %{
3600   predicate(n->as_Vector()->length() == 2);
3601   match(Set dst (ReplicateL con));
3602   format %{ "movq    $dst,[$constantaddress]\n\t"
3603             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3604   ins_encode %{
3605     __ movq($dst$$XMMRegister, $constantaddress($con));
3606     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3607   %}
3608   ins_pipe( pipe_slow );
3609 %}
3610 
3611 instruct Repl4L_imm(vecY dst, immL con) %{
3612   predicate(n->as_Vector()->length() == 4);
3613   match(Set dst (ReplicateL con));
3614   format %{ "movq    $dst,[$constantaddress]\n\t"
3615             "punpcklqdq $dst,$dst\n\t"
3616             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
3617   ins_encode %{
3618     __ movq($dst$$XMMRegister, $constantaddress($con));
3619     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3620     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3621   %}
3622   ins_pipe( pipe_slow );
3623 %}
3624 
3625 instruct Repl8L_imm(vecZ dst, immL con) %{
3626   predicate(n->as_Vector()->length() == 8);
3627   match(Set dst (ReplicateL con));
3628   format %{ "movq    $dst,[$constantaddress]\n\t"
3629             "punpcklqdq $dst,$dst\n\t"
3630             "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t"
3631             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %}
3632   ins_encode %{
3633     __ movq($dst$$XMMRegister, $constantaddress($con));
3634     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3635     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3636     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3637   %}
3638   ins_pipe( pipe_slow );
3639 %}
3640 
3641 // Long could be loaded into xmm register directly from memory.
3642 instruct Repl2L_mem(vecX dst, memory mem) %{
3643   predicate(n->as_Vector()->length() == 2);
3644   match(Set dst (ReplicateL (LoadL mem)));
3645   format %{ "movq    $dst,$mem\n\t"
3646             "punpcklqdq $dst,$dst\t! replicate2L" %}
3647   ins_encode %{
3648     __ movq($dst$$XMMRegister, $mem$$Address);
3649     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3650   %}
3651   ins_pipe( pipe_slow );
3652 %}
3653 
3654 instruct Repl4L_mem(vecY dst, memory mem) %{
3655   predicate(n->as_Vector()->length() == 4);
3656   match(Set dst (ReplicateL (LoadL mem)));
3657   format %{ "movq    $dst,$mem\n\t"
3658             "punpcklqdq $dst,$dst\n\t"
3659             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3660   ins_encode %{
3661     __ movq($dst$$XMMRegister, $mem$$Address);
3662     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3663     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3664   %}
3665   ins_pipe( pipe_slow );
3666 %}
3667 
3668 instruct Repl8L_mem(vecZ dst, memory mem) %{
3669   predicate(n->as_Vector()->length() == 8);
3670   match(Set dst (ReplicateL (LoadL mem)));
3671   format %{ "movq    $dst,$mem\n\t"
3672             "punpcklqdq $dst,$dst\n\t"
3673             "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
3674             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
3675   ins_encode %{
3676     __ movq($dst$$XMMRegister, $mem$$Address);
3677     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3678     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3679     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3680   %}
3681   ins_pipe( pipe_slow );
3682 %}
3683 
3684 // Replicate long (8 byte) scalar zero to be vector
3685 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3686   predicate(n->as_Vector()->length() == 2);
3687   match(Set dst (ReplicateL zero));
3688   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3689   ins_encode %{
3690     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3691   %}
3692   ins_pipe( fpu_reg_reg );
3693 %}
3694 
3695 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3696   predicate(n->as_Vector()->length() == 4);
3697   match(Set dst (ReplicateL zero));
3698   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3699   ins_encode %{
3700     // vpxor with a 256-bit operand requires AVX2; vector_len = 1 selects the 256-bit form.
3701     int vector_len = 1;
3702     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3703   %}
3704   ins_pipe( fpu_reg_reg );
3705 %}
3706 
3707 instruct Repl8L_zero(vecZ dst, immL0 zero) %{
3708   predicate(n->as_Vector()->length() == 8);
3709   match(Set dst (ReplicateL zero));
3710   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
3711   ins_encode %{
3712     // vpxor with a 512-bit operand requires EVEX (AVX-512); vector_len = 2 selects the 512-bit form.
3713     int vector_len = 2;
3714     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3715   %}
3716   ins_pipe( fpu_reg_reg );
3717 %}
3718 
3719 // Replicate float (4 byte) scalar to be vector
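// Annotation: the float already sits in an XMM register, so pshufd with
// 0x00 broadcasts element 0 directly; no GPR transfer is needed.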
3720 instruct Repl2F(vecD dst, regF src) %{
3721   predicate(n->as_Vector()->length() == 2);
3722   match(Set dst (ReplicateF src));
3723   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
3724   ins_encode %{
3725     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3726   %}
3727   ins_pipe( fpu_reg_reg );
3728 %}
3729 
3730 instruct Repl4F(vecX dst, regF src) %{
3731   predicate(n->as_Vector()->length() == 4);
3732   match(Set dst (ReplicateF src));
3733   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
3734   ins_encode %{
3735     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3736   %}
3737   ins_pipe( pipe_slow );
3738 %}
3739 
3740 instruct Repl8F(vecY dst, regF src) %{
3741   predicate(n->as_Vector()->length() == 8);
3742   match(Set dst (ReplicateF src));
3743   format %{ "pshufd  $dst,$src,0x00\n\t"
3744             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3745   ins_encode %{
3746     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3747     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3748   %}
3749   ins_pipe( pipe_slow );
3750 %}
3751 
3752 instruct Repl16F(vecZ dst, regF src) %{
3753   predicate(n->as_Vector()->length() == 16);
3754   match(Set dst (ReplicateF src));
3755   format %{ "pshufd  $dst,$src,0x00\n\t"
3756             "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t"
3757             "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %}
3758   ins_encode %{
3759     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3760     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3761     __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3762   %}
3763   ins_pipe( pipe_slow );
3764 %}
3765 
3766 // Replicate float (4 byte) scalar zero to be vector
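// Annotation: xorps/xorpd are used instead of pxor so the zeroing stays in
// the floating-point execution domain, avoiding a bypass delay on cores
// that separate integer and FP SIMD domains.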
3767 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3768   predicate(n->as_Vector()->length() == 2);
3769   match(Set dst (ReplicateF zero));
3770   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3771   ins_encode %{
3772     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3773   %}
3774   ins_pipe( fpu_reg_reg );
3775 %}
3776 
3777 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3778   predicate(n->as_Vector()->length() == 4);
3779   match(Set dst (ReplicateF zero));
3780   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3781   ins_encode %{
3782     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3783   %}
3784   ins_pipe( fpu_reg_reg );
3785 %}
3786 
3787 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3788   predicate(n->as_Vector()->length() == 8);
3789   match(Set dst (ReplicateF zero));
3790   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3791   ins_encode %{
3792     int vector_len = 1;
3793     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3794   %}
3795   ins_pipe( fpu_reg_reg );
3796 %}
3797 
3798 instruct Repl16F_zero(vecZ dst, immF0 zero) %{
3799   predicate(n->as_Vector()->length() == 16);
3800   match(Set dst (ReplicateF zero));
3801   format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
3802   ins_encode %{
3803     int vector_len = 2;
3804     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3805   %}
3806   ins_pipe( fpu_reg_reg );
3807 %}
3808 
3809 // Replicate double (8 bytes) scalar to be vector
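// Annotation: pshufd immediate 0x44 (binary 01 00 01 00) selects dwords
// 0,1,0,1, i.e. it duplicates the low 64-bit double into both halves of
// the register.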
3810 instruct Repl2D(vecX dst, regD src) %{
3811   predicate(n->as_Vector()->length() == 2);
3812   match(Set dst (ReplicateD src));
3813   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3814   ins_encode %{
3815     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3816   %}
3817   ins_pipe( pipe_slow );
3818 %}
3819 
3820 instruct Repl4D(vecY dst, regD src) %{
3821   predicate(n->as_Vector()->length() == 4);
3822   match(Set dst (ReplicateD src));
3823   format %{ "pshufd  $dst,$src,0x44\n\t"
3824             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3825   ins_encode %{
3826     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3827     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3828   %}
3829   ins_pipe( pipe_slow );
3830 %}
3831 
3832 instruct Repl8D(vecZ dst, regD src) %{
3833   predicate(n->as_Vector()->length() == 8);
3834   match(Set dst (ReplicateD src));
3835   format %{ "pshufd  $dst,$src,0x44\n\t"
3836             "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t"
3837             "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %}
3838   ins_encode %{
3839     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3840     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3841     __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3842   %}
3843   ins_pipe( pipe_slow );
3844 %}
3845 
3846 // Replicate double (8 byte) scalar zero to be vector
3847 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3848   predicate(n->as_Vector()->length() == 2);
3849   match(Set dst (ReplicateD zero));
3850   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3851   ins_encode %{
3852     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3853   %}
3854   ins_pipe( fpu_reg_reg );
3855 %}
3856 
3857 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3858   predicate(n->as_Vector()->length() == 4);
3859   match(Set dst (ReplicateD zero));
3860   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3861   ins_encode %{
3862     int vector_len = 1;
3863     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3864   %}
3865   ins_pipe( fpu_reg_reg );
3866 %}
3867 
3868 instruct Repl8D_zero(vecZ dst, immD0 zero) %{
3869   predicate(n->as_Vector()->length() == 8);
3870   match(Set dst (ReplicateD zero));
3871   format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
3872   ins_encode %{
3873     int vector_len = 2;
3874     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3875   %}
3876   ins_pipe( fpu_reg_reg );
3877 %}
3878 
3879 // ====================REDUCTION ARITHMETIC=======================================
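// Annotation: the SSE reduction below folds the vector first (phaddd adds
// adjacent dword pairs, leaving the two-element sum in element 0) and only
// then adds the scalar input src1 with a single paddd.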
3880 
3881 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
3882   predicate(UseSSE > 2 && UseAVX == 0);
3883   match(Set dst (AddReductionVI src1 src2));
3884   effect(TEMP tmp2, TEMP tmp);
3885   format %{ "movdqu  $tmp2,$src2\n\t"
3886             "phaddd  $tmp2,$tmp2\n\t"
3887             "movd    $tmp,$src1\n\t"
3888             "paddd   $tmp,$tmp2\n\t"
3889             "movd    $dst,$tmp\t! add reduction2I" %}


4946   ins_pipe( pipe_slow );
4947 %}
4948 
4949 // ====================VECTOR ARITHMETIC=======================================
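// Annotation: each operation/size pairing comes in two flavors: a
// 2-operand SSE form that matches (OpV dst src) and overwrites dst, and a
// 3-operand AVX form (predicate UseAVX > 0) that reads independent
// src1/src2 and writes dst non-destructively; vector_len = 0 keeps these
// forms at 128 bits.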
4950 
4951 // --------------------------------- ADD --------------------------------------
4952 
4953 // Bytes vector add
4954 instruct vadd4B(vecS dst, vecS src) %{
4955   predicate(n->as_Vector()->length() == 4);
4956   match(Set dst (AddVB dst src));
4957   format %{ "paddb   $dst,$src\t! add packed4B" %}
4958   ins_encode %{
4959     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
4965   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4966   match(Set dst (AddVB src1 src2));
4967   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
4968   ins_encode %{
4969     int vector_len = 0;
4970     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4971   %}
4972   ins_pipe( pipe_slow );
4973 %}
4974 
4975 instruct vadd8B(vecD dst, vecD src) %{
4976   predicate(n->as_Vector()->length() == 8);
4977   match(Set dst (AddVB dst src));
4978   format %{ "paddb   $dst,$src\t! add packed8B" %}
4979   ins_encode %{
4980     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4981   %}
4982   ins_pipe( pipe_slow );
4983 %}
4984 
4985 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
4986   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4987   match(Set dst (AddVB src1 src2));
4988   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
4989   ins_encode %{
4990     int vector_len = 0;
4991     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
4992   %}
4993   ins_pipe( pipe_slow );
4994 %}
4995 
4996 instruct vadd16B(vecX dst, vecX src) %{
4997   predicate(n->as_Vector()->length() == 16);
4998   match(Set dst (AddVB dst src));
4999   format %{ "paddb   $dst,$src\t! add packed16B" %}
5000   ins_encode %{
5001     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5002   %}
5003   ins_pipe( pipe_slow );
5004 %}
5005 
5006 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
5007   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5008   match(Set dst (AddVB src1 src2));
5009   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5010   ins_encode %{
5011     int vector_len = 0;
5012     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5013   %}
5014   ins_pipe( pipe_slow );
5015 %}


5074   predicate(n->as_Vector()->length() == 2);
5075   match(Set dst (AddVS dst src));
5076   format %{ "paddw   $dst,$src\t! add packed2S" %}
5077   ins_encode %{
5078     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5079   %}
5080   ins_pipe( pipe_slow );
5081 %}
5082 
5083 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
5084   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5085   match(Set dst (AddVS src1 src2));
5086   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5087   ins_encode %{
5088     int vector_len = 0;
5089     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5090   %}
5091   ins_pipe( pipe_slow );
5092 %}
5093 
5094 instruct vadd4S(vecD dst, vecD src) %{
5095   predicate(n->as_Vector()->length() == 4);
5096   match(Set dst (AddVS dst src));
5097   format %{ "paddw   $dst,$src\t! add packed4S" %}
5098   ins_encode %{
5099     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5100   %}
5101   ins_pipe( pipe_slow );
5102 %}
5103 
5104 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
5105   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5106   match(Set dst (AddVS src1 src2));
5107   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5108   ins_encode %{
5109     int vector_len = 0;
5110     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5111   %}
5112   ins_pipe( pipe_slow );
5113 %}
5114 
5115 instruct vadd8S(vecX dst, vecX src) %{
5116   predicate(n->as_Vector()->length() == 8);
5117   match(Set dst (AddVS dst src));
5118   format %{ "paddw   $dst,$src\t! add packed8S" %}
5119   ins_encode %{
5120     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5121   %}
5122   ins_pipe( pipe_slow );
5123 %}
5124 
5125 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
5126   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5127   match(Set dst (AddVS src1 src2));
5128   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5129   ins_encode %{
5130     int vector_len = 0;
5131     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5132   %}
5133   ins_pipe( pipe_slow );
5134 %}


5193   predicate(n->as_Vector()->length() == 2);
5194   match(Set dst (AddVI dst src));
5195   format %{ "paddd   $dst,$src\t! add packed2I" %}
5196   ins_encode %{
5197     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5198   %}
5199   ins_pipe( pipe_slow );
5200 %}
5201 
5202 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
5203   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5204   match(Set dst (AddVI src1 src2));
5205   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
5206   ins_encode %{
5207     int vector_len = 0;
5208     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5209   %}
5210   ins_pipe( pipe_slow );
5211 %}
5212 
5213 instruct vadd4I(vecX dst, vecX src) %{
5214   predicate(n->as_Vector()->length() == 4);
5215   match(Set dst (AddVI dst src));
5216   format %{ "paddd   $dst,$src\t! add packed4I" %}
5217   ins_encode %{
5218     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5219   %}
5220   ins_pipe( pipe_slow );
5221 %}
5222 
5223 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
5224   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5225   match(Set dst (AddVI src1 src2));
5226   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
5227   ins_encode %{
5228     int vector_len = 0;
5229     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5230   %}
5231   ins_pipe( pipe_slow );
5232 %}


5368   predicate(n->as_Vector()->length() == 2);
5369   match(Set dst (AddVF dst src));
5370   format %{ "addps   $dst,$src\t! add packed2F" %}
5371   ins_encode %{
5372     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5373   %}
5374   ins_pipe( pipe_slow );
5375 %}
5376 
5377 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
5378   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5379   match(Set dst (AddVF src1 src2));
5380   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
5381   ins_encode %{
5382     int vector_len = 0;
5383     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5384   %}
5385   ins_pipe( pipe_slow );
5386 %}
5387 
5388 instruct vadd4F(vecX dst, vecX src) %{
5389   predicate(n->as_Vector()->length() == 4);
5390   match(Set dst (AddVF dst src));
5391   format %{ "addps   $dst,$src\t! add packed4F" %}
5392   ins_encode %{
5393     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5394   %}
5395   ins_pipe( pipe_slow );
5396 %}
5397 
5398 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
5399   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5400   match(Set dst (AddVF src1 src2));
5401   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
5402   ins_encode %{
5403     int vector_len = 0;
5404     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5405   %}
5406   ins_pipe( pipe_slow );
5407 %}


5545   predicate(n->as_Vector()->length() == 4);
5546   match(Set dst (SubVB dst src));
5547   format %{ "psubb   $dst,$src\t! sub packed4B" %}
5548   ins_encode %{
5549     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5550   %}
5551   ins_pipe( pipe_slow );
5552 %}
5553 
5554 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
5555   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5556   match(Set dst (SubVB src1 src2));
5557   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
5558   ins_encode %{
5559     int vector_len = 0;
5560     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5561   %}
5562   ins_pipe( pipe_slow );
5563 %}
5564 
5565 instruct vsub8B(vecD dst, vecD src) %{
5566   predicate(n->as_Vector()->length() == 8);
5567   match(Set dst (SubVB dst src));
5568   format %{ "psubb   $dst,$src\t! sub packed8B" %}
5569   ins_encode %{
5570     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5571   %}
5572   ins_pipe( pipe_slow );
5573 %}
5574 
5575 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
5576   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5577   match(Set dst (SubVB src1 src2));
5578   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
5579   ins_encode %{
5580     int vector_len = 0;
5581     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5582   %}
5583   ins_pipe( pipe_slow );
5584 %}
5585 
5586 instruct vsub16B(vecX dst, vecX src) %{
5587   predicate(n->as_Vector()->length() == 16);
5588   match(Set dst (SubVB dst src));
5589   format %{ "psubb   $dst,$src\t! sub packed16B" %}
5590   ins_encode %{
5591     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5592   %}
5593   ins_pipe( pipe_slow );
5594 %}
5595 
5596 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
5597   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5598   match(Set dst (SubVB src1 src2));
5599   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
5600   ins_encode %{
5601     int vector_len = 0;
5602     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5603   %}
5604   ins_pipe( pipe_slow );
5605 %}


5664   predicate(n->as_Vector()->length() == 2);
5665   match(Set dst (SubVS dst src));
5666   format %{ "psubw   $dst,$src\t! sub packed2S" %}
5667   ins_encode %{
5668     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5669   %}
5670   ins_pipe( pipe_slow );
5671 %}
5672 
5673 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
5674   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5675   match(Set dst (SubVS src1 src2));
5676   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
5677   ins_encode %{
5678     int vector_len = 0;
5679     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5680   %}
5681   ins_pipe( pipe_slow );
5682 %}
5683 
5684 instruct vsub4S(vecD dst, vecD src) %{
5685   predicate(n->as_Vector()->length() == 4);
5686   match(Set dst (SubVS dst src));
5687   format %{ "psubw   $dst,$src\t! sub packed4S" %}
5688   ins_encode %{
5689     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5690   %}
5691   ins_pipe( pipe_slow );
5692 %}
5693 
5694 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
5695   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5696   match(Set dst (SubVS src1 src2));
5697   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
5698   ins_encode %{
5699     int vector_len = 0;
5700     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5701   %}
5702   ins_pipe( pipe_slow );
5703 %}
5704 
5705 instruct vsub8S(vecX dst, vecX src) %{
5706   predicate(n->as_Vector()->length() == 8);
5707   match(Set dst (SubVS dst src));
5708   format %{ "psubw   $dst,$src\t! sub packed8S" %}
5709   ins_encode %{
5710     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5711   %}
5712   ins_pipe( pipe_slow );
5713 %}
5714 
5715 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
5716   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5717   match(Set dst (SubVS src1 src2));
5718   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
5719   ins_encode %{
5720     int vector_len = 0;
5721     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5722   %}
5723   ins_pipe( pipe_slow );
5724 %}


5783   predicate(n->as_Vector()->length() == 2);
5784   match(Set dst (SubVI dst src));
5785   format %{ "psubd   $dst,$src\t! sub packed2I" %}
5786   ins_encode %{
5787     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5788   %}
5789   ins_pipe( pipe_slow );
5790 %}
5791 
5792 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
5793   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5794   match(Set dst (SubVI src1 src2));
5795   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
5796   ins_encode %{
5797     int vector_len = 0;
5798     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5799   %}
5800   ins_pipe( pipe_slow );
5801 %}
5802 
5803 instruct vsub4I(vecX dst, vecX src) %{
5804   predicate(n->as_Vector()->length() == 4);
5805   match(Set dst (SubVI dst src));
5806   format %{ "psubd   $dst,$src\t! sub packed4I" %}
5807   ins_encode %{
5808     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5809   %}
5810   ins_pipe( pipe_slow );
5811 %}
5812 
5813 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
5814   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5815   match(Set dst (SubVI src1 src2));
5816   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
5817   ins_encode %{
5818     int vector_len = 0;
5819     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5820   %}
5821   ins_pipe( pipe_slow );
5822 %}


5958   predicate(n->as_Vector()->length() == 2);
5959   match(Set dst (SubVF dst src));
5960   format %{ "subps   $dst,$src\t! sub packed2F" %}
5961   ins_encode %{
5962     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5963   %}
5964   ins_pipe( pipe_slow );
5965 %}
5966 
5967 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
5968   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5969   match(Set dst (SubVF src1 src2));
5970   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
5971   ins_encode %{
5972     int vector_len = 0;
5973     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5974   %}
5975   ins_pipe( pipe_slow );
5976 %}
5977 
5978 instruct vsub4F(vecX dst, vecX src) %{
5979   predicate(n->as_Vector()->length() == 4);
5980   match(Set dst (SubVF dst src));
5981   format %{ "subps   $dst,$src\t! sub packed4F" %}
5982   ins_encode %{
5983     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5984   %}
5985   ins_pipe( pipe_slow );
5986 %}
5987 
5988 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
5989   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5990   match(Set dst (SubVF src1 src2));
5991   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
5992   ins_encode %{
5993     int vector_len = 0;
5994     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5995   %}
5996   ins_pipe( pipe_slow );
5997 %}

6134 instruct vmul2S(vecS dst, vecS src) %{
6135   predicate(n->as_Vector()->length() == 2);
6136   match(Set dst (MulVS dst src));
6137   format %{ "pmullw $dst,$src\t! mul packed2S" %}
6138   ins_encode %{
6139     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6140   %}
6141   ins_pipe( pipe_slow );
6142 %}
6143 
6144 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
6145   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6146   match(Set dst (MulVS src1 src2));
6147   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
6148   ins_encode %{
6149     int vector_len = 0;
6150     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6151   %}
6152   ins_pipe( pipe_slow );
6153 %}
6154 
6155 instruct vmul4S(vecD dst, vecD src) %{
6156   predicate(n->as_Vector()->length() == 4);
6157   match(Set dst (MulVS dst src));
6158   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
6159   ins_encode %{
6160     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6161   %}
6162   ins_pipe( pipe_slow );
6163 %}
6164 
6165 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
6166   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6167   match(Set dst (MulVS src1 src2));
6168   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
6169   ins_encode %{
6170     int vector_len = 0;
6171     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6172   %}
6173   ins_pipe( pipe_slow );
6174 %}
6175 
6176 instruct vmul8S(vecX dst, vecX src) %{
6177   predicate(n->as_Vector()->length() == 8);
6178   match(Set dst (MulVS dst src));
6179   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
6180   ins_encode %{
6181     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6182   %}
6183   ins_pipe( pipe_slow );
6184 %}
6185 
6186 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
6187   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6188   match(Set dst (MulVS src1 src2));
6189   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
6190   ins_encode %{
6191     int vector_len = 0;
6192     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6193   %}
6194   ins_pipe( pipe_slow );
6195 %}
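
// Illustrative scalar model: pmullw/vpmullw compute a 16x16 -> 32 bit product
// per lane and keep only the low 16 bits; the high half is simply discarded.
#include <cstdint>
static inline int16_t pmullw_lane(int16_t a, int16_t b) {
  return (int16_t)(uint16_t)((int32_t)a * (int32_t)b);  // low 16 bits only
}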

6253 instruct vmul2I(vecD dst, vecD src) %{
6254   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
6255   match(Set dst (MulVI dst src));
6256   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
6257   ins_encode %{
6258     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
6259   %}
6260   ins_pipe( pipe_slow );
6261 %}
6262 
6263 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
6264   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6265   match(Set dst (MulVI src1 src2));
6266   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
6267   ins_encode %{
6268     int vector_len = 0;
6269     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6270   %}
6271   ins_pipe( pipe_slow );
6272 %}
6273 
6274 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
6275   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
6276   match(Set dst (MulVL src1 src2));
6277   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
6278   ins_encode %{
6279     int vector_len = 0;
6280     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6281   %}
6282   ins_pipe( pipe_slow );
6283 %}
6284 
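// Illustrative scalar model: vpmullq is the 64-bit analogue, keeping the low
// 64 bits of each lane product. Note the predicate above: there is no legacy
// SSE/AVX2 packed 64-bit multiply, so this pattern requires AVX-512DQ.
#include <cstdint>
static inline int64_t vpmullq_lane(int64_t a, int64_t b) {
  return (int64_t)((uint64_t)a * (uint64_t)b);  // wraps modulo 2^64
}
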
6285 instruct vmul4I(vecX dst, vecX src) %{
6286   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
6287   match(Set dst (MulVI dst src));
6288   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
6289   ins_encode %{
6290     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
6291   %}
6292   ins_pipe( pipe_slow );
6293 %}
6294 
6295 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
6296   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6297   match(Set dst (MulVI src1 src2));
6298   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
6299   ins_encode %{
6300     int vector_len = 0;
6301     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6302   %}
6303   ins_pipe( pipe_slow );
6304 %}
6305 
6306 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
6307   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6308   match(Set dst (MulVI src (LoadVector mem)));
6309   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
6310   ins_encode %{
6311     int vector_len = 0;
6312     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6313   %}
6314   ins_pipe( pipe_slow );
6315 %}
6316 
6317 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
6318   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
6319   match(Set dst (MulVL src1 src2));
6320   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
6321   ins_encode %{
6322     int vector_len = 1;
6323     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6324   %}
6325   ins_pipe( pipe_slow );
6326 %}
6327 
6328 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
6329   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
6330   match(Set dst (MulVL src (LoadVector mem)));
6331   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
6332   ins_encode %{
6333     int vector_len = 1;
6334     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6335   %}
6336   ins_pipe( pipe_slow );
6337 %}
6338 
6339 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
6340   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6341   match(Set dst (MulVI src1 src2));
6342   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
6343   ins_encode %{
6344     int vector_len = 1;
6345     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6346   %}
6347   ins_pipe( pipe_slow );
6348 %}
6349 
6350 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6351   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
6352   match(Set dst (MulVL src1 src2));
6353   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
6354   ins_encode %{
6355     int vector_len = 2;
6356     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6357   %}
6358   ins_pipe( pipe_slow );
6359 %}
6360 
6361 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6362   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6363   match(Set dst (MulVI src1 src2));
6364   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
6365   ins_encode %{
6366     int vector_len = 2;
6367     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6368   %}
6369   ins_pipe( pipe_slow );
6370 %}
6371 
6372 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
6373   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6374   match(Set dst (MulVI src (LoadVector mem)));
6375   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
6376   ins_encode %{
6377     int vector_len = 1;
6378     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6379   %}
6380   ins_pipe( pipe_slow );
6381 %}
6382 
6383 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
6384   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
6385   match(Set dst (MulVL src (LoadVector mem)));
6386   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
6387   ins_encode %{
6388     int vector_len = 2;
6389     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6390   %}
6391   ins_pipe( pipe_slow );
6392 %}
6393 
6394 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
6395   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6396   match(Set dst (MulVI src (LoadVector mem)));
6397   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
6398   ins_encode %{
6399     int vector_len = 2;
6400     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6401   %}
6402   ins_pipe( pipe_slow );
6403 %}
6404 
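// The _mem forms above match (LoadVector mem) directly, letting the matcher
// fold the load into the multiply's memory operand. Sketch of the saved
// instruction (illustrative assembly, not emitted verbatim):
//
//   vmovdqu xmm1, [mem]            ; separate load ...
//   vpmulld xmm0, xmm0, xmm1       ; ... plus reg-reg multiply
//
// versus the folded form:
//
//   vpmulld xmm0, xmm0, [mem]      ; one instruction, one fewer register
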
6405 // Floats vector mul
6406 instruct vmul2F(vecD dst, vecD src) %{
6407   predicate(n->as_Vector()->length() == 2);
6408   match(Set dst (MulVF dst src));
6409   format %{ "mulps   $dst,$src\t! mul packed2F" %}
6410   ins_encode %{
6411     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
6412   %}
6413   ins_pipe( pipe_slow );
6414 %}
6415 
6416 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
6417   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6418   match(Set dst (MulVF src1 src2));
6419   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
6420   ins_encode %{
6421     int vector_len = 0;
6422     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6423   %}
6424   ins_pipe( pipe_slow );
6425 %}
6426 
6427 instruct vmul4F(vecX dst, vecX src) %{
6428   predicate(n->as_Vector()->length() == 4);
6429   match(Set dst (MulVF dst src));
6430   format %{ "mulps   $dst,$src\t! mul packed4F" %}
6431   ins_encode %{
6432     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
6433   %}
6434   ins_pipe( pipe_slow );
6435 %}
6436 
6437 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
6438   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6439   match(Set dst (MulVF src1 src2));
6440   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
6441   ins_encode %{
6442     int vector_len = 0;
6443     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6444   %}
6445   ins_pipe( pipe_slow );
6446 %}

6583 instruct vdiv2F(vecD dst, vecD src) %{
6584   predicate(n->as_Vector()->length() == 2);
6585   match(Set dst (DivVF dst src));
6586   format %{ "divps   $dst,$src\t! div packed2F" %}
6587   ins_encode %{
6588     __ divps($dst$$XMMRegister, $src$$XMMRegister);
6589   %}
6590   ins_pipe( pipe_slow );
6591 %}
6592 
6593 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
6594   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6595   match(Set dst (DivVF src1 src2));
6596   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
6597   ins_encode %{
6598     int vector_len = 0;
6599     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6600   %}
6601   ins_pipe( pipe_slow );
6602 %}
6603 
6604 instruct vdiv4F(vecX dst, vecX src) %{
6605   predicate(n->as_Vector()->length() == 4);
6606   match(Set dst (DivVF dst src));
6607   format %{ "divps   $dst,$src\t! div packed4F" %}
6608   ins_encode %{
6609     __ divps($dst$$XMMRegister, $src$$XMMRegister);
6610   %}
6611   ins_pipe( pipe_slow );
6612 %}
6613 
6614 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
6615   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6616   match(Set dst (DivVF src1 src2));
6617   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
6618   ins_encode %{
6619     int vector_len = 0;
6620     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6621   %}
6622   ins_pipe( pipe_slow );
6623 %}

7860 instruct vand4B(vecS dst, vecS src) %{
7861   predicate(n->as_Vector()->length_in_bytes() == 4);
7862   match(Set dst (AndV dst src));
7863   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
7864   ins_encode %{
7865     __ pand($dst$$XMMRegister, $src$$XMMRegister);
7866   %}
7867   ins_pipe( pipe_slow );
7868 %}
7869 
7870 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
7871   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
7872   match(Set dst (AndV src1 src2));
7873   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
7874   ins_encode %{
7875     int vector_len = 0;
7876     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7877   %}
7878   ins_pipe( pipe_slow );
7879 %}
7880 
7881 instruct vand8B(vecD dst, vecD src) %{
7882   predicate(n->as_Vector()->length_in_bytes() == 8);
7883   match(Set dst (AndV dst src));
7884   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
7885   ins_encode %{
7886     __ pand($dst$$XMMRegister, $src$$XMMRegister);
7887   %}
7888   ins_pipe( pipe_slow );
7889 %}
7890 
7891 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
7892   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
7893   match(Set dst (AndV src1 src2));
7894   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
7895   ins_encode %{
7896     int vector_len = 0;
7897     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7898   %}
7899   ins_pipe( pipe_slow );
7900 %}
7901 
7902 instruct vand16B(vecX dst, vecX src) %{
7903   predicate(n->as_Vector()->length_in_bytes() == 16);
7904   match(Set dst (AndV dst src));
7905   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
7906   ins_encode %{
7907     __ pand($dst$$XMMRegister, $src$$XMMRegister);
7908   %}
7909   ins_pipe( pipe_slow );
7910 %}
7911 
7912 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
7913   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
7914   match(Set dst (AndV src1 src2));
7915   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
7916   ins_encode %{
7917     int vector_len = 0;
7918     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7919   %}
7920   ins_pipe( pipe_slow );
7921 %}
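
// Illustrative model: unlike the arithmetic patterns above, the bitwise
// AndV/OrV/XorV nodes key on length_in_bytes() rather than lane count,
// since a bitwise op is identical for every lane type of the same width.
#include <cstdint>
#include <cstddef>
static inline void pand_model(uint8_t dst[], const uint8_t src[], size_t nbytes) {
  for (size_t i = 0; i < nbytes; i++) {
    dst[i] &= src[i];  // nbytes is 4, 8 or 16 in the patterns here
  }
}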

7980 instruct vor4B(vecS dst, vecS src) %{
7981   predicate(n->as_Vector()->length_in_bytes() == 4);
7982   match(Set dst (OrV dst src));
7983   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
7984   ins_encode %{
7985     __ por($dst$$XMMRegister, $src$$XMMRegister);
7986   %}
7987   ins_pipe( pipe_slow );
7988 %}
7989 
7990 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
7991   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
7992   match(Set dst (OrV src1 src2));
7993   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
7994   ins_encode %{
7995     int vector_len = 0;
7996     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7997   %}
7998   ins_pipe( pipe_slow );
7999 %}
8000 
8001 instruct vor8B(vecD dst, vecD src) %{
8002   predicate(n->as_Vector()->length_in_bytes() == 8);
8003   match(Set dst (OrV dst src));
8004   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
8005   ins_encode %{
8006     __ por($dst$$XMMRegister, $src$$XMMRegister);
8007   %}
8008   ins_pipe( pipe_slow );
8009 %}
8010 
8011 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
8012   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8013   match(Set dst (OrV src1 src2));
8014   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
8015   ins_encode %{
8016     int vector_len = 0;
8017     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8018   %}
8019   ins_pipe( pipe_slow );
8020 %}
8021 
8022 instruct vor16B(vecX dst, vecX src) %{
8023   predicate(n->as_Vector()->length_in_bytes() == 16);
8024   match(Set dst (OrV dst src));
8025   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
8026   ins_encode %{
8027     __ por($dst$$XMMRegister, $src$$XMMRegister);
8028   %}
8029   ins_pipe( pipe_slow );
8030 %}
8031 
8032 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
8033   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8034   match(Set dst (OrV src1 src2));
8035   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
8036   ins_encode %{
8037     int vector_len = 0;
8038     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8039   %}
8040   ins_pipe( pipe_slow );
8041 %}

8100 instruct vxor4B(vecS dst, vecS src) %{
8101   predicate(n->as_Vector()->length_in_bytes() == 4);
8102   match(Set dst (XorV dst src));
8103   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
8104   ins_encode %{
8105     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
8106   %}
8107   ins_pipe( pipe_slow );
8108 %}
8109 
8110 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
8111   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8112   match(Set dst (XorV src1 src2));
8113   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
8114   ins_encode %{
8115     int vector_len = 0;
8116     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8117   %}
8118   ins_pipe( pipe_slow );
8119 %}
8120 
8121 instruct vxor8B(vecD dst, vecD src) %{
8122   predicate(n->as_Vector()->length_in_bytes() == 8);
8123   match(Set dst (XorV dst src));
8124   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
8125   ins_encode %{
8126     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
8127   %}
8128   ins_pipe( pipe_slow );
8129 %}
8130 
8131 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
8132   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8133   match(Set dst (XorV src1 src2));
8134   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
8135   ins_encode %{
8136     int vector_len = 0;
8137     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8138   %}
8139   ins_pipe( pipe_slow );
8140 %}
8141 
8142 instruct vxor16B(vecX dst, vecX src) %{
8143   predicate(n->as_Vector()->length_in_bytes() == 16);
8144   match(Set dst (XorV dst src));
8145   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
8146   ins_encode %{
8147     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
8148   %}
8149   ins_pipe( pipe_slow );
8150 %}
8151 
8152 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
8153   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8154   match(Set dst (XorV src1 src2));
8155   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
8156   ins_encode %{
8157     int vector_len = 0;
8158     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8159   %}
8160   ins_pipe( pipe_slow );
8161 %}

2877   ins_cost(145);
2878   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2879   ins_encode %{
2880     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2881   %}
2882   ins_pipe( pipe_slow );
2883 %}
2884 
2885 instruct storeV64(memory mem, vecZ src) %{
2886   predicate(n->as_StoreVector()->memory_size() == 64);
2887   match(Set mem (StoreVector mem src));
2888   ins_cost(145);
2889   format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
2890   ins_encode %{
2891     int vector_len = 2;
2892     __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
2893   %}
2894   ins_pipe( pipe_slow );
2895 %}
2896 
2897 // ====================LEGACY REPLICATE=======================================
2898 
2899 instruct Repl16B(vecX dst, rRegI src) %{
2900   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2901   match(Set dst (ReplicateB src));
2902   format %{ "movd    $dst,$src\n\t"
2903             "punpcklbw $dst,$dst\n\t"
2904             "pshuflw $dst,$dst,0x00\n\t"
2905             "punpcklqdq $dst,$dst\t! replicate16B" %}
2906   ins_encode %{
2907     __ movdl($dst$$XMMRegister, $src$$Register);
2908     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2909     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2910     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2911   %}
2912   ins_pipe( pipe_slow );
2913 %}
2914 
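// Worked trace of the broadcast idiom above (b = the byte being replicated,
// s1..s3 = the upper bytes of the 32-bit source, shown low byte first):
//   movdl        dst = b s1 s2 s3  0 0 0 0 | 0 ... 0
//   punpcklbw    dst = b b s1 s1 s2 s2 s3 s3 | 0 ... 0  (each byte paired with itself)
//   pshuflw 0x00 dst = b b b b b b b b | 0 ... 0         (word 0 copied to words 0..3)
//   punpcklqdq   dst = 16 x b                            (low qword duplicated)
//   vinserti128h dst = 32 x b                            (Repl32B: low 128 bits duplicated)
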
2915 instruct Repl16B_mem(vecX dst, memory mem) %{
2916   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2917   match(Set dst (ReplicateB (LoadB mem)));
2918   format %{ "punpcklbw $dst,$mem\n\t"
2919             "pshuflw $dst,$dst,0x00\n\t"
2920             "punpcklqdq $dst,$dst\t! replicate16B" %}
2921   ins_encode %{
2922     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2923     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2924     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2925   %}
2926   ins_pipe( pipe_slow );
2927 %}
2928 
2929 instruct Repl32B(vecY dst, rRegI src) %{
2930   predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2931   match(Set dst (ReplicateB src));
2932   format %{ "movd    $dst,$src\n\t"
2933             "punpcklbw $dst,$dst\n\t"
2934             "pshuflw $dst,$dst,0x00\n\t"
2935             "punpcklqdq $dst,$dst\n\t"
2936             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
2937   ins_encode %{
2938     __ movdl($dst$$XMMRegister, $src$$Register);
2939     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
2940     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2941     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2942     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2943   %}
2944   ins_pipe( pipe_slow );
2945 %}
2946 
2947 instruct Repl32B_mem(vecY dst, memory mem) %{
2948   predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2949   match(Set dst (ReplicateB (LoadB mem)));
2950   format %{ "punpcklbw $dst,$mem\n\t"
2951             "pshuflw $dst,$dst,0x00\n\t"
2952             "punpcklqdq $dst,$dst\n\t"
2953             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
2954   ins_encode %{
2955     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2956     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2957     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2958     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2959   %}
2960   ins_pipe( pipe_slow );
2961 %}
2962 
2963 instruct Repl16B_imm(vecX dst, immI con) %{
2964   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2965   match(Set dst (ReplicateB con));
2966   format %{ "movq    $dst,[$constantaddress]\n\t"
2967             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
2968   ins_encode %{
2969     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2970     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2971   %}
2972   ins_pipe( pipe_slow );
2973 %}
2974 
2975 instruct Repl32B_imm(vecY dst, immI con) %{
2976   predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2977   match(Set dst (ReplicateB con));
2978   format %{ "movq    $dst,[$constantaddress]\n\t"
2979             "punpcklqdq $dst,$dst\n\t"
2980             "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
2981   ins_encode %{
2982     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
2983     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2984     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2985   %}
2986   ins_pipe( pipe_slow );
2987 %}
2988 
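// Sketch of the constant-table helper used by the _imm forms (assumption:
// this mirrors replicate8_imm's behaviour; the model name is illustrative):
// it repeats an n-byte immediate across 64 bits so a single movq loads the
// whole replicated pattern.
#include <cstdint>
static inline uint64_t replicate8_imm_model(int con, int width_bytes) {
  uint64_t mask = (width_bytes >= 8) ? ~0ULL : ((1ULL << (width_bytes * 8)) - 1);
  uint64_t val  = (uint64_t)con & mask;
  uint64_t out  = 0;
  for (int shift = 0; shift < 64; shift += width_bytes * 8) {
    out |= val << shift;
  }
  return out;  // replicate8_imm_model(0x0A, 1) == 0x0A0A0A0A0A0A0A0A
}
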
2989 instruct Repl16B_zero(vecX dst, immI0 zero) %{
2990   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && UseAVX < 3);
2991   match(Set dst (ReplicateB zero));
2992   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
2993   ins_encode %{
2994     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2995   %}
2996   ins_pipe( fpu_reg_reg );
2997 %}
2998 
2999 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3000   predicate(n->as_Vector()->length() == 32 && UseAVX > 0 && UseAVX < 3);
3001   match(Set dst (ReplicateB zero));
3002   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3003   ins_encode %{
3004     // MacroAssembler::vpxor falls back to vxorpd here when AVX2 is absent (256-bit vpxor is AVX2-only).
3005     int vector_len = 1;
3006     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3007   %}
3008   ins_pipe( fpu_reg_reg );
3009 %}
3010 
3011 instruct Repl8S(vecX dst, rRegI src) %{
3012   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3013   match(Set dst (ReplicateS src));
3014   format %{ "movd    $dst,$src\n\t"
3015             "pshuflw $dst,$dst,0x00\n\t"
3016             "punpcklqdq $dst,$dst\t! replicate8S" %}
3017   ins_encode %{
3018     __ movdl($dst$$XMMRegister, $src$$Register);
3019     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3020     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3021   %}
3022   ins_pipe( pipe_slow );
3023 %}
3024 
3025 instruct Repl16S(vecY dst, rRegI src) %{
3026   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3027   match(Set dst (ReplicateS src));
3028   format %{ "movd    $dst,$src\n\t"
3029             "pshuflw $dst,$dst,0x00\n\t"
3030             "punpcklqdq $dst,$dst\n\t"
3031             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
3032   ins_encode %{
3033     __ movdl($dst$$XMMRegister, $src$$Register);
3034     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3035     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3036     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3037   %}
3038   ins_pipe( pipe_slow );
3039 %}
3040 
3041 instruct Repl8S_imm(vecX dst, immI con) %{
3042   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3043   match(Set dst (ReplicateS con));
3044   format %{ "movq    $dst,[$constantaddress]\n\t"
3045             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3046   ins_encode %{
3047     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3048     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3049   %}
3050   ins_pipe( pipe_slow );
3051 %}
3052 
3053 instruct Repl16S_imm(vecY dst, immI con) %{
3054   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3055   match(Set dst (ReplicateS con));
3056   format %{ "movq    $dst,[$constantaddress]\n\t"
3057             "punpcklqdq $dst,$dst\n\t"
3058             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
3059   ins_encode %{
3060     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3061     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3062     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3063   %}
3064   ins_pipe( pipe_slow );
3065 %}
3066 
3067 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3068   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && UseAVX < 3);
3069   match(Set dst (ReplicateS zero));
3070   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3071   ins_encode %{
3072     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3073   %}
3074   ins_pipe( fpu_reg_reg );
3075 %}
3076 
3077 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3078   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && UseAVX < 3);
3079   match(Set dst (ReplicateS zero));
3080   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3081   ins_encode %{
3082     // MacroAssembler::vpxor falls back to vxorpd here when AVX2 is absent (256-bit vpxor is AVX2-only).
3083     int vector_len = 1;
3084     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3085   %}
3086   ins_pipe( fpu_reg_reg );
3087 %}
3088 
3089 instruct Repl4I(vecX dst, rRegI src) %{
3090   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3091   match(Set dst (ReplicateI src));
3092   format %{ "movd    $dst,$src\n\t"
3093             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3094   ins_encode %{
3095     __ movdl($dst$$XMMRegister, $src$$Register);
3096     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3097   %}
3098   ins_pipe( pipe_slow );
3099 %}
3100 
3101 instruct Repl4I_mem(vecX dst, memory mem) %{
3102   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3103   match(Set dst (ReplicateI (LoadI mem)));
3104   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3105   ins_encode %{
3106     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3107   %}
3108   ins_pipe( pipe_slow );
3109 %}
3110 
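// Illustrative scalar model of pshufd with immediate 0x00, as used above:
// each destination dword picks a source dword via two selector bits, so
// imm 0x00 broadcasts dword 0. Because pshufd accepts a memory operand,
// the _mem form needs no separate load.
#include <cstdint>
static inline void pshufd_model(uint32_t dst[4], const uint32_t src[4], uint8_t imm) {
  for (int i = 0; i < 4; i++) {
    dst[i] = src[(imm >> (2 * i)) & 3];
  }
}
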
3111 instruct Repl8I(vecY dst, rRegI src) %{
3112   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3113   match(Set dst (ReplicateI src));
3114   format %{ "movd    $dst,$src\n\t"
3115             "pshufd  $dst,$dst,0x00\n\t"
3116             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3117   ins_encode %{
3118     __ movdl($dst$$XMMRegister, $src$$Register);
3119     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3120     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3121   %}
3122   ins_pipe( pipe_slow );
3123 %}
3124 
3125 instruct Repl8I_mem(vecY dst, memory mem) %{
3126   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3127   match(Set dst (ReplicateI (LoadI mem)));
3128   format %{ "pshufd  $dst,$mem,0x00\n\t"
3129             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3130   ins_encode %{
3131     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3132     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3133   %}
3134   ins_pipe( pipe_slow );
3135 %}
3136 
3137 instruct Repl4I_imm(vecX dst, immI con) %{
3138   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3139   match(Set dst (ReplicateI con));
3140   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3141             "punpcklqdq $dst,$dst" %}
3142   ins_encode %{
3143     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3144     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3145   %}
3146   ins_pipe( pipe_slow );
3147 %}
3148 
3149 instruct Repl8I_imm(vecY dst, immI con) %{
3150   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3151   match(Set dst (ReplicateI con));
3152   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3153             "punpcklqdq $dst,$dst\n\t"
3154             "vinserti128h $dst,$dst,$dst" %}
3155   ins_encode %{
3156     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3157     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3158     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3159   %}
3160   ins_pipe( pipe_slow );
3161 %}
3162 
3163 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3164   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && UseAVX < 3);
3165   match(Set dst (ReplicateI zero));
3166   format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
3167   ins_encode %{
3168     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3169   %}
3170   ins_pipe( fpu_reg_reg );
3171 %}
3172 
3173 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3174   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && UseAVX < 3);
3175   match(Set dst (ReplicateI zero));
3176   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3177   ins_encode %{
3178     // MacroAssembler::vpxor falls back to vxorpd here when AVX2 is absent (256-bit vpxor is AVX2-only).
3179     int vector_len = 1;
3180     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3181   %}
3182   ins_pipe( fpu_reg_reg );
3183 %}
3184 
3185 // Replicate long (8 byte) scalar to be vector
3186 #ifdef _LP64
3187 instruct Repl4L(vecY dst, rRegL src) %{
3188   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3189   match(Set dst (ReplicateL src));
3190   format %{ "movdq   $dst,$src\n\t"
3191             "punpcklqdq $dst,$dst\n\t"
3192             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3193   ins_encode %{
3194     __ movdq($dst$$XMMRegister, $src$$Register);
3195     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3196     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3197   %}
3198   ins_pipe( pipe_slow );
3199 %}
3200 #else // _LP64
3201 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3202   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3203   match(Set dst (ReplicateL src));
3204   effect(TEMP dst, USE src, TEMP tmp);
3205   format %{ "movdl   $dst,$src.lo\n\t"
3206             "movdl   $tmp,$src.hi\n\t"
3207             "punpckldq $dst,$tmp\n\t"
3208             "punpcklqdq $dst,$dst\n\t"
3209             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3210   ins_encode %{
3211     __ movdl($dst$$XMMRegister, $src$$Register);
3212     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3213     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3214     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3215     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3216   %}
3217   ins_pipe( pipe_slow );
3218 %}
3219 #endif // _LP64
3220 
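// On 32-bit (#else branch above) a long occupies a register pair, so the
// broadcast first assembles the two halves with punpckldq. Illustrative
// model of the movdl/movdl/punpckldq prologue:
#include <cstdint>
static inline uint64_t pair_to_qword(uint32_t lo, uint32_t hi) {
  return ((uint64_t)hi << 32) | lo;  // HIGH_FROM_LOW supplies hi
}
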
3221 instruct Repl4L_imm(vecY dst, immL con) %{
3222   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3223   match(Set dst (ReplicateL con));
3224   format %{ "movq    $dst,[$constantaddress]\n\t"
3225             "punpcklqdq $dst,$dst\n\t"
3226             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
3227   ins_encode %{
3228     __ movq($dst$$XMMRegister, $constantaddress($con));
3229     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3230     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3231   %}
3232   ins_pipe( pipe_slow );
3233 %}
3234 
3235 instruct Repl4L_mem(vecY dst, memory mem) %{
3236   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3237   match(Set dst (ReplicateL (LoadL mem)));
3238   format %{ "movq    $dst,$mem\n\t"
3239             "punpcklqdq $dst,$dst\n\t"
3240             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3241   ins_encode %{
3242     __ movq($dst$$XMMRegister, $mem$$Address);
3243     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3244     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3245   %}
3246   ins_pipe( pipe_slow );
3247 %}
3248 
3249 instruct Repl8L_mem(vecZ dst, memory mem) %{
3250   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && UseAVX < 3);
3251   match(Set dst (ReplicateL (LoadL mem)));
3252   format %{ "movq    $dst,$mem\n\t"
3253             "punpcklqdq $dst,$dst\n\t"
3254             "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
3255             "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
3256   ins_encode %{
3257     __ movq($dst$$XMMRegister, $mem$$Address);
3258     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3259     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3260     __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3261   %}
3262   ins_pipe( pipe_slow );
3263 %}
3264 
3265 instruct Repl8F(vecY dst, regF src) %{
3266   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3267   match(Set dst (ReplicateF src));
3268   format %{ "pshufd  $dst,$src,0x00\n\t"
3269             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3270   ins_encode %{
3271     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3272     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3273   %}
3274   ins_pipe( pipe_slow );
3275 %}
3276 
3277 instruct Repl8F_mem(vecY dst, memory mem) %{
3278   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3279   match(Set dst (ReplicateF (LoadF mem)));
3280   format %{ "pshufd  $dst,$mem,0x00\n\t"
3281             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3282   ins_encode %{
3283     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3284     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3285   %}
3286   ins_pipe( pipe_slow );
3287 %}
3288 
3289 instruct Repl4D(vecY dst, regD src) %{
3290   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3291   match(Set dst (ReplicateD src));
3292   format %{ "pshufd  $dst,$src,0x44\n\t"
3293             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3294   ins_encode %{
3295     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3296     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3297   %}
3298   ins_pipe( pipe_slow );
3299 %}
3300 
3301 instruct Repl4D_mem(vecY dst, memory mem) %{
3302   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3303   match(Set dst (ReplicateD (LoadD mem)));
3304   format %{ "pshufd  $dst,$mem,0x44\n\t"
3305             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3306   ins_encode %{
3307     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3308     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3309   %}
3310   ins_pipe( pipe_slow );
3311 %}
3312 
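// Note on the shuffle immediate used for doubles above: 0x44 is the selector
// 01'00'01'00, i.e. source dwords {0,1,0,1}, which copies the low 64-bit
// double into both qword lanes -- the double analogue of the 0x00 broadcast.
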
3313 // ====================GENERIC REPLICATE==========================================
3314 
3315 // Replicate byte scalar to be vector
3316 instruct Repl4B(vecS dst, rRegI src) %{
3317   predicate(n->as_Vector()->length() == 4);
3318   match(Set dst (ReplicateB src));
3319   format %{ "movd    $dst,$src\n\t"
3320             "punpcklbw $dst,$dst\n\t"
3321             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3322   ins_encode %{
3323     __ movdl($dst$$XMMRegister, $src$$Register);
3324     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3325     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3326   %}
3327   ins_pipe( pipe_slow );
3328 %}
3329 
3330 instruct Repl4B_mem(vecS dst, memory mem) %{
3331   predicate(n->as_Vector()->length() == 4);
3332   match(Set dst (ReplicateB (LoadB mem)));
3333   format %{ "punpcklbw $dst,$mem\n\t"
3334             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3335   ins_encode %{
3336     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3337     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3338   %}
3339   ins_pipe( pipe_slow );
3340 %}
3341 
3342 instruct Repl8B(vecD dst, rRegI src) %{
3343   predicate(n->as_Vector()->length() == 8);
3344   match(Set dst (ReplicateB src));
3345   format %{ "movd    $dst,$src\n\t"
3346             "punpcklbw $dst,$dst\n\t"
3347             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3348   ins_encode %{
3349     __ movdl($dst$$XMMRegister, $src$$Register);
3350     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3351     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3352   %}
3353   ins_pipe( pipe_slow );
3354 %}
3355 
3356 instruct Repl8B_mem(vecD dst, memory mem) %{
3357   predicate(n->as_Vector()->length() == 8);
3358   match(Set dst (ReplicateB (LoadB mem)));
3359   format %{ "punpcklbw $dst,$mem\n\t"
3360             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3361   ins_encode %{
3362     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3363     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3364   %}
3365   ins_pipe( pipe_slow );
3366 %}
3367 
3368 // Replicate byte scalar immediate to be vector by loading from const table.
3369 instruct Repl4B_imm(vecS dst, immI con) %{
3370   predicate(n->as_Vector()->length() == 4);
3371   match(Set dst (ReplicateB con));
3372   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3373   ins_encode %{
3374     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3375   %}
3376   ins_pipe( pipe_slow );
3377 %}
3378 
3379 instruct Repl8B_imm(vecD dst, immI con) %{
3380   predicate(n->as_Vector()->length() == 8);
3381   match(Set dst (ReplicateB con));
3382   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3383   ins_encode %{
3384     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3385   %}
3386   ins_pipe( pipe_slow );
3387 %}
3388 
3389 // Replicate byte scalar zero to be vector
3390 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3391   predicate(n->as_Vector()->length() == 4);
3392   match(Set dst (ReplicateB zero));
3393   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3394   ins_encode %{
3395     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3396   %}
3397   ins_pipe( fpu_reg_reg );
3398 %}
3399 
3400 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3401   predicate(n->as_Vector()->length() == 8);
3402   match(Set dst (ReplicateB zero));
3403   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3404   ins_encode %{
3405     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3406   %}
3407   ins_pipe( fpu_reg_reg );
3408 %}
3409 
3410 // Replicate char/short (2 byte) scalar to be vector
3411 instruct Repl2S(vecS dst, rRegI src) %{
3412   predicate(n->as_Vector()->length() == 2);
3413   match(Set dst (ReplicateS src));
3414   format %{ "movd    $dst,$src\n\t"
3415             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3416   ins_encode %{
3417     __ movdl($dst$$XMMRegister, $src$$Register);
3418     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3419   %}
3420   ins_pipe( fpu_reg_reg );
3421 %}
3422 
3423 instruct Repl4S(vecD dst, rRegI src) %{
3424   predicate(n->as_Vector()->length() == 4);
3425   match(Set dst (ReplicateS src));
3426   format %{ "movd    $dst,$src\n\t"
3427             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3428   ins_encode %{
3429     __ movdl($dst$$XMMRegister, $src$$Register);
3430     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3431   %}
3432   ins_pipe( fpu_reg_reg );
3433 %}
3434 
3435 instruct Repl4S_mem(vecD dst, memory mem) %{
3436   predicate(n->as_Vector()->length() == 4);
3437   match(Set dst (ReplicateS (LoadS mem)));
3438   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3439   ins_encode %{
3440     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3441   %}
3442   ins_pipe( fpu_reg_reg );
3443 %}
3444 
3445 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3446 instruct Repl2S_imm(vecS dst, immI con) %{
3447   predicate(n->as_Vector()->length() == 2);
3448   match(Set dst (ReplicateS con));
3449   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3450   ins_encode %{
3451     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3452   %}
3453   ins_pipe( fpu_reg_reg );
3454 %}
3455 
3456 instruct Repl4S_imm(vecD dst, immI con) %{
3457   predicate(n->as_Vector()->length() == 4);
3458   match(Set dst (ReplicateS con));
3459   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3460   ins_encode %{
3461     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3462   %}
3463   ins_pipe( fpu_reg_reg );
3464 %}
3465 
3466 // Replicate char/short (2 byte) scalar zero to be vector
3467 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3468   predicate(n->as_Vector()->length() == 2);
3469   match(Set dst (ReplicateS zero));
3470   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3471   ins_encode %{
3472     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3473   %}
3474   ins_pipe( fpu_reg_reg );
3475 %}
3476 
3477 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3478   predicate(n->as_Vector()->length() == 4);
3479   match(Set dst (ReplicateS zero));
3480   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3481   ins_encode %{
3482     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3483   %}
3484   ins_pipe( fpu_reg_reg );
3485 %}
3486 
3487 // Replicate integer (4 byte) scalar to be vector
3488 instruct Repl2I(vecD dst, rRegI src) %{
3489   predicate(n->as_Vector()->length() == 2);
3490   match(Set dst (ReplicateI src));
3491   format %{ "movd    $dst,$src\n\t"
3492             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3493   ins_encode %{
3494     __ movdl($dst$$XMMRegister, $src$$Register);
3495     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3496   %}
3497   ins_pipe( fpu_reg_reg );
3498 %}
3499 
3500 // Integer could be loaded into xmm register directly from memory.
3501 instruct Repl2I_mem(vecD dst, memory mem) %{
3502   predicate(n->as_Vector()->length() == 2);
3503   match(Set dst (ReplicateI (LoadI mem)));
3504   format %{ "movd    $dst,$mem\n\t"
3505             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3506   ins_encode %{
3507     __ movdl($dst$$XMMRegister, $mem$$Address);
3508     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3509   %}
3510   ins_pipe( fpu_reg_reg );
3511 %}
3512 
3513 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3514 instruct Repl2I_imm(vecD dst, immI con) %{
3515   predicate(n->as_Vector()->length() == 2);
3516   match(Set dst (ReplicateI con));
3517   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3518   ins_encode %{
3519     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3520   %}
3521   ins_pipe( fpu_reg_reg );
3522 %}
3523 
3524 // Replicate integer (4 byte) scalar zero to be vector
3525 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3526   predicate(n->as_Vector()->length() == 2);
3527   match(Set dst (ReplicateI zero));
3528   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3529   ins_encode %{
3530     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3531   %}
3532   ins_pipe( fpu_reg_reg );
3533 %}
3534 
3535 // Replicate long (8 byte) scalar to be vector
3536 #ifdef _LP64
3537 instruct Repl2L(vecX dst, rRegL src) %{
3538   predicate(n->as_Vector()->length() == 2);
3539   match(Set dst (ReplicateL src));
3540   format %{ "movdq   $dst,$src\n\t"
3541             "punpcklqdq $dst,$dst\t! replicate2L" %}
3542   ins_encode %{
3543     __ movdq($dst$$XMMRegister, $src$$Register);
3544     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3545   %}
3546   ins_pipe( pipe_slow );
3547 %}
3548 #else // _LP64
3549 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3550   predicate(n->as_Vector()->length() == 2);
3551   match(Set dst (ReplicateL src));
3552   effect(TEMP dst, USE src, TEMP tmp);
3553   format %{ "movdl   $dst,$src.lo\n\t"
3554             "movdl   $tmp,$src.hi\n\t"
3555             "punpckldq $dst,$tmp\n\t"
3556             "punpcklqdq $dst,$dst\t! replicate2L"%}
3557   ins_encode %{
3558     __ movdl($dst$$XMMRegister, $src$$Register);
3559     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3560     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3561     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3562   %}
3563   ins_pipe( pipe_slow );
3564 %}
3565 #endif // _LP64
3566 
3567 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3568 instruct Repl2L_imm(vecX dst, immL con) %{
3569   predicate(n->as_Vector()->length() == 2);
3570   match(Set dst (ReplicateL con));
3571   format %{ "movq    $dst,[$constantaddress]\n\t"
3572             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3573   ins_encode %{
3574     __ movq($dst$$XMMRegister, $constantaddress($con));
3575     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3576   %}
3577   ins_pipe( pipe_slow );
3578 %}
3579 
3580 // Long could be loaded into xmm register directly from memory.
3581 instruct Repl2L_mem(vecX dst, memory mem) %{
3582   predicate(n->as_Vector()->length() == 2);
3583   match(Set dst (ReplicateL (LoadL mem)));
3584   format %{ "movq    $dst,$mem\n\t"
3585             "punpcklqdq $dst,$dst\t! replicate2L" %}
3586   ins_encode %{
3587     __ movq($dst$$XMMRegister, $mem$$Address);
3588     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3589   %}
3590   ins_pipe( pipe_slow );
3591 %}
3592 
3593 // Replicate long (8 byte) scalar zero to be vector
3594 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3595   predicate(n->as_Vector()->length() == 2);
3596   match(Set dst (ReplicateL zero));
3597   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3598   ins_encode %{
3599     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3600   %}
3601   ins_pipe( fpu_reg_reg );
3602 %}
3603 
3604 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3605   predicate(n->as_Vector()->length() == 4);
3606   match(Set dst (ReplicateL zero));
3607   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3608   ins_encode %{
3609     // MacroAssembler::vpxor falls back to vxorpd here when AVX2 is absent (256-bit vpxor is AVX2-only).
3610     int vector_len = 1;
3611     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3612   %}
3613   ins_pipe( fpu_reg_reg );
3614 %}
3615 
3616 // Replicate float (4 byte) scalar to be vector
3617 instruct Repl2F(vecD dst, regF src) %{
3618   predicate(n->as_Vector()->length() == 2);
3619   match(Set dst (ReplicateF src));
3620   format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
3621   ins_encode %{
3622     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3623   %}
3624   ins_pipe( fpu_reg_reg );
3625 %}
3626 
3627 instruct Repl2F_mem(vecD dst, memory mem) %{
3628   predicate(n->as_Vector()->length() == 2);
3629   match(Set dst (ReplicateF (LoadF mem)));
3630   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3631   ins_encode %{
3632     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3633   %}
3634   ins_pipe( pipe_slow );
3635 %}
3636 
3637 instruct Repl4F(vecX dst, regF src) %{
3638   predicate(n->as_Vector()->length() == 4);
3639   match(Set dst (ReplicateF src));
3640   format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
3641   ins_encode %{
3642     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3643   %}
3644   ins_pipe( pipe_slow );
3645 %}
3646 
3647 instruct Repl4F_mem(vecX dst, memory mem) %{
3648   predicate(n->as_Vector()->length() == 4);
3649   match(Set dst (ReplicateF (LoadF mem)));
3650   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3651   ins_encode %{
3652     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3653   %}
3654   ins_pipe( pipe_slow );
3655 %}
3656 
3657 // Replicate float (4 byte) scalar zero to be vector
3658 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3659   predicate(n->as_Vector()->length() == 2);
3660   match(Set dst (ReplicateF zero));
3661   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3662   ins_encode %{
3663     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3664   %}
3665   ins_pipe( fpu_reg_reg );
3666 %}
3667 
3668 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3669   predicate(n->as_Vector()->length() == 4);
3670   match(Set dst (ReplicateF zero));
3671   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3672   ins_encode %{
3673     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3674   %}
3675   ins_pipe( fpu_reg_reg );
3676 %}
3677 
3678 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3679   predicate(n->as_Vector()->length() == 8);
3680   match(Set dst (ReplicateF zero));
3681   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3682   ins_encode %{
3683     int vector_len = 1;
3684     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3685   %}
3686   ins_pipe( fpu_reg_reg );
3687 %}
3688 
3689 // Replicate double (8 bytes) scalar to be vector
3690 instruct Repl2D(vecX dst, regD src) %{
3691   predicate(n->as_Vector()->length() == 2);
3692   match(Set dst (ReplicateD src));
3693   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3694   ins_encode %{
3695     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3696   %}
3697   ins_pipe( pipe_slow );
3698 %}
3699 
3700 instruct Repl2D_mem(vecX dst, memory mem) %{
3701   predicate(n->as_Vector()->length() == 2);
3702   match(Set dst (ReplicateD (LoadD mem)));
3703   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3704   ins_encode %{
3705     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3706   %}
3707   ins_pipe( pipe_slow );
3708 %}
3709 
3710 // Replicate double (8 byte) scalar zero to be vector
3711 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3712   predicate(n->as_Vector()->length() == 2);
3713   match(Set dst (ReplicateD zero));
3714   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3715   ins_encode %{
3716     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3717   %}
3718   ins_pipe( fpu_reg_reg );
3719 %}
3720 
3721 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3722   predicate(n->as_Vector()->length() == 4);
3723   match(Set dst (ReplicateD zero));
3724   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3725   ins_encode %{
3726     int vector_len = 1;
3727     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3728   %}
3729   ins_pipe( fpu_reg_reg );
3730 %}
3731 
3732 // ====================EVEX REPLICATE=============================================
3733 
3734 // Note: some of the legacy forms are applicable to EVEX
3735 
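// With AVX-512VL+BW, a broadcast is a single instruction instead of the
// legacy multi-step idiom. For replicate16B (illustrative comparison):
//   legacy: movd + punpcklbw + pshuflw + punpcklqdq   (4 instructions)
//   evex:   vpbroadcastb xmm, r32                     (1 instruction)
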
3736 instruct Repl16B_evex(vecX dst, rRegI src) %{
3737   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3738   match(Set dst (ReplicateB src));
3739   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
3740   ins_encode %{
3741     int vector_len = 0;
3742     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3743   %}
3744   ins_pipe( pipe_slow );
3745 %}
3746 
3747 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
3748   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3749   match(Set dst (ReplicateB (LoadB mem)));
3750   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
3751   ins_encode %{
3752     int vector_len = 0;
3753     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3754   %}
3755   ins_pipe( pipe_slow );
3756 %}
3757 
3758 instruct Repl32B_evex(vecY dst, rRegI src) %{
3759   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3760   match(Set dst (ReplicateB src));
3761   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
3762   ins_encode %{
3763     int vector_len = 1;
3764     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3765   %}
3766   ins_pipe( pipe_slow );
3767 %}
3768 
3769 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
3770   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3771   match(Set dst (ReplicateB (LoadB mem)));
3772   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
3773   ins_encode %{
3774     int vector_len = 1;
3775     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3776   %}
3777   ins_pipe( pipe_slow );
3778 %}
3779 
3780 instruct Repl64B_evex(vecZ dst, rRegI src) %{
3781   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3782   match(Set dst (ReplicateB src));
3783   format %{ "vpbroadcastb $dst,$src\t! replicate64B" %}
3784   ins_encode %{
3785     int vector_len = 2;
3786     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3787   %}
3788   ins_pipe( pipe_slow );
3789 %}
3790 
3791 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
3792   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
3793   match(Set dst (ReplicateB (LoadB mem)));
3794   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
3795   ins_encode %{
3796     int vector_len = 2;
3797     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3798   %}
3799   ins_pipe( pipe_slow );
3800 %}
3801 
3802 instruct Repl16B_imm_evex(vecX dst, immI con) %{
3803   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3804   match(Set dst (ReplicateB con));
3805   format %{ "movq    $dst,[$constantaddress]\n\t"
3806             "vpbroadcastb $dst,$dst\t! replicate16B" %}
3807   ins_encode %{
3808     int vector_len = 0;
3809     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3810     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3811   %}
3812   ins_pipe( pipe_slow );
3813 %}
3814 
3815 instruct Repl32B_imm_evex(vecY dst, immI con) %{
3816   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3817   match(Set dst (ReplicateB con));
3818   format %{ "movq    $dst,[$constantaddress]\n\t"
3819             "vpbroadcastb $dst,$dst\t! replicate32B" %}
3820   ins_encode %{
3821     int vector_len = 1;
3822     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3823     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3824   %}
3825   ins_pipe( pipe_slow );
3826 %}
3827 
3828 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
3829   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3830   match(Set dst (ReplicateB con));
3831   format %{ "movq    $dst,[$constantaddress]\n\t"
3832             "vpbroadcastb $dst,$dst\t! replicate64B" %}
3833   ins_encode %{
3834     int vector_len = 2;
3835     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3836     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3837   %}
3838   ins_pipe( pipe_slow );
3840 %}
3841 
3842 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
3843   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3844   match(Set dst (ReplicateB zero));
3845   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3846   ins_encode %{
3847     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
3848     int vector_len = 2;
3849     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3850   %}
3851   ins_pipe( fpu_reg_reg );
3852 %}
3853 
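// Replicate char/short (2 byte) scalar to be vector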
3854 instruct Repl8S_evex(vecX dst, rRegI src) %{
3855   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3856   match(Set dst (ReplicateS src));
3857   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
3858   ins_encode %{
3859     int vector_len = 0;
3860     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3861   %}
3862   ins_pipe( pipe_slow );
3863 %}
3864 
3865 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
3866   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3867   match(Set dst (ReplicateS (LoadS mem)));
3868   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
3869   ins_encode %{
3870     int vector_len = 0;
3871     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3872   %}
3873   ins_pipe( pipe_slow );
3874 %}
3875 
3876 instruct Repl16S_evex(vecY dst, rRegI src) %{
3877   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3878   match(Set dst (ReplicateS src));
3879   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
3880   ins_encode %{
3881     int vector_len = 1;
3882     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3883   %}
3884   ins_pipe( pipe_slow );
3885 %}
3886 
3887 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
3888   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3889   match(Set dst (ReplicateS (LoadS mem)));
3890   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
3891   ins_encode %{
3892     int vector_len = 1;
3893     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3894   %}
3895   ins_pipe( pipe_slow );
3896 %}
3897 
3898 instruct Repl32S_evex(vecZ dst, rRegI src) %{
3899   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3900   match(Set dst (ReplicateS src));
3901   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
3902   ins_encode %{
3903     int vector_len = 2;
3904     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3905   %}
3906   ins_pipe( pipe_slow );
3907 %}
3908 
3909 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
3910   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3911   match(Set dst (ReplicateS (LoadS mem)));
3912   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
3913   ins_encode %{
3914     int vector_len = 2;
3915     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3916   %}
3917   ins_pipe( pipe_slow );
3918 %}
3919 
3920 instruct Repl8S_imm_evex(vecX dst, immI con) %{
3921   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3922   match(Set dst (ReplicateS con));
3923   format %{ "movq    $dst,[$constantaddress]\n\t"
3924             "vpbroadcastw $dst,$dst\t! replicate8S" %}
3925   ins_encode %{
3926     int vector_len = 0;
3927     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3928     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3929   %}
3930   ins_pipe( pipe_slow );
3931 %}
3932 
3933 instruct Repl16S_imm_evex(vecY dst, immI con) %{
3934   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3935   match(Set dst (ReplicateS con));
3936   format %{ "movq    $dst,[$constantaddress]\n\t"
3937             "vpbroadcastw $dst,$dst\t! replicate16S" %}
3938   ins_encode %{
3939     int vector_len = 1;
3940     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3941     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3942   %}
3943   ins_pipe( pipe_slow );
3944 %}
3945 
3946 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
3947   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
3948   match(Set dst (ReplicateS con));
3949   format %{ "movq    $dst,[$constantaddress]\n\t"
3950             "vpbroadcastw $dst,$dst\t! replicate32S" %}
3951   ins_encode %{
3952     int vector_len = 2;
3953     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3954     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3955   %}
3956   ins_pipe( pipe_slow );
3957 %}
3958 
3959 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
3960   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
3961   match(Set dst (ReplicateS zero));
3962   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
3963   ins_encode %{
3964     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
3965     int vector_len = 2;
3966     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3967   %}
3968   ins_pipe( fpu_reg_reg );
3969 %}
3970 
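// Replicate integer (4 byte) scalar to be vector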
3971 instruct Repl4I_evex(vecX dst, rRegI src) %{
3972   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3973   match(Set dst (ReplicateI src));
3974   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
3975   ins_encode %{
3976     int vector_len = 0;
3977     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
3978   %}
3979   ins_pipe( pipe_slow );
3980 %}
3981 
3982 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
3983   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
3984   match(Set dst (ReplicateI (LoadI mem)));
3985   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
3986   ins_encode %{
3987     int vector_len = 0;
3988     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
3989   %}
3990   ins_pipe( pipe_slow );
3991 %}
3992 
3993 instruct Repl8I_evex(vecY dst, rRegI src) %{
3994   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
3995   match(Set dst (ReplicateI src));
3996   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
3997   ins_encode %{
3998     int vector_len = 1;
3999     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4000   %}
4001   ins_pipe( pipe_slow );
4002 %}
4003 
4004 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4005   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4006   match(Set dst (ReplicateI (LoadI mem)));
4007   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4008   ins_encode %{
4009     int vector_len = 1;
4010     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4011   %}
4012   ins_pipe( pipe_slow );
4013 %}
4014 
4015 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4016   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4017   match(Set dst (ReplicateI src));
4018   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
4019   ins_encode %{
4020     int vector_len = 2;
4021     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4022   %}
4023   ins_pipe( pipe_slow );
4024 %}
4025 
4026 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4027   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4028   match(Set dst (ReplicateI (LoadI mem)));
4029   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4030   ins_encode %{
4031     int vector_len = 2;
4032     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4033   %}
4034   ins_pipe( pipe_slow );
4035 %}
4036 
4037 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4038   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4039   match(Set dst (ReplicateI con));
4040   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
4041             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4042   ins_encode %{
4043     int vector_len = 0;
4044     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4045     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4046   %}
4047   ins_pipe( pipe_slow );
4048 %}
4049 
4050 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4051   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4052   match(Set dst (ReplicateI con));
4053   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4054             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4055   ins_encode %{
4056     int vector_len = 1;
4057     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4058     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4059   %}
4060   ins_pipe( pipe_slow );
4061 %}
4062 
4063 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4064   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4065   match(Set dst (ReplicateI con));
4066   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4067             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4068   ins_encode %{
4069     int vector_len = 2;
4070     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4071     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4072   %}
4073   ins_pipe( pipe_slow );
4074 %}
4075 
4076 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4077   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4078   match(Set dst (ReplicateI zero));
4079   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4080   ins_encode %{
4081     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4082     int vector_len = 2;
4083     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4084   %}
4085   ins_pipe( fpu_reg_reg );
4086 %}
4087 
4088 // Replicate long (8 byte) scalar to be vector
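// On LP64 the long arrives in a single GP register and can be broadcast
// directly; the 32-bit variants (#else below) first assemble the lo/hi
// register pair into an XMM temp with movdl/punpckldq.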
4089 #ifdef _LP64
4090 instruct Repl4L_evex(vecY dst, rRegL src) %{
4091   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4092   match(Set dst (ReplicateL src));
4093   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4094   ins_encode %{
4095     int vector_len = 1;
4096     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4097   %}
4098   ins_pipe( pipe_slow );
4099 %}
4100 
4101 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4102   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4103   match(Set dst (ReplicateL src));
4104   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4105   ins_encode %{
4106     int vector_len = 2;
4107     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4108   %}
4109   ins_pipe( pipe_slow );
4110 %}
4111 #else // _LP64
4112 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4113   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4114   match(Set dst (ReplicateL src));
4115   effect(TEMP dst, USE src, TEMP tmp);
4116   format %{ "movdl   $dst,$src.lo\n\t"
4117             "movdl   $tmp,$src.hi\n\t"
4118             "punpckldq $dst,$tmp\n\t"
4119             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4120   ins_encode %{
4121     int vector_len = 1;
4122     __ movdl($dst$$XMMRegister, $src$$Register);
4123     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4124     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4125     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4126   %}
4127   ins_pipe( pipe_slow );
4128 %}
4129 
4130 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4131   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4132   match(Set dst (ReplicateL src));
4133   effect(TEMP dst, USE src, TEMP tmp);
4134   format %{ "movdl   $dst,$src.lo\n\t"
4135             "movdl   $tmp,$src.hi\n\t"
4136             "punpckldq $dst,$tmp\n\t"
4137             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4138   ins_encode %{
4139     int vector_len = 2;
4140     __ movdl($dst$$XMMRegister, $src$$Register);
4141     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4142     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4143     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4144   %}
4145   ins_pipe( pipe_slow );
4146 %}
4147 #endif // _LP64
4148 
4149 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4150   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4151   match(Set dst (ReplicateL con));
4152   format %{ "movq    $dst,[$constantaddress]\n\t"
4153             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4154   ins_encode %{
4155     int vector_len = 1;
4156     __ movq($dst$$XMMRegister, $constantaddress($con));
4157     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4158   %}
4159   ins_pipe( pipe_slow );
4160 %}
4161 
4162 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4163   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4164   match(Set dst (ReplicateL con));
4165   format %{ "movq    $dst,[$constantaddress]\n\t"
4166             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4167   ins_encode %{
4168     int vector_len = 2;
4169     __ movq($dst$$XMMRegister, $constantaddress($con));
4170     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4171   %}
4172   ins_pipe( pipe_slow );
4173 %}
4174 
4175 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4176   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4177   match(Set dst (ReplicateL (LoadL mem)));
4178   format %{ "vpbroadcastq  $dst,$mem\t! replicate4L" %}
4179   ins_encode %{
4180     int vector_len = 1;
4181     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4182   %}
4183   ins_pipe( pipe_slow );
4184 %}
4185 
4186 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4187   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4188   match(Set dst (ReplicateL (LoadL mem)));
4189   format %{ "vpbroadcastq  $dst,$mem\t! replicate8L" %}
4190   ins_encode %{
4191     int vector_len = 2;
4192     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4193   %}
4194   ins_pipe( pipe_slow );
4195 %}
4196 
4197 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4198   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4199   match(Set dst (ReplicateL zero));
4200   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4201   ins_encode %{
4202     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4203     int vector_len = 2;
4204     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4205   %}
4206   ins_pipe( fpu_reg_reg );
4207 %}
4208 
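// Replicate float (4 byte) scalar to be vector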
4209 instruct Repl8F_evex(vecY dst, regF src) %{
4210   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4211   match(Set dst (ReplicateF src));
4212   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4213   ins_encode %{
4214     int vector_len = 1;
4215     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4216   %}
4217   ins_pipe( pipe_slow );
4218 %}
4219 
4220 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4221   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4222   match(Set dst (ReplicateF (LoadF mem)));
4223   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4224   ins_encode %{
4225     int vector_len = 1;
4226     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4227   %}
4228   ins_pipe( pipe_slow );
4229 %}
4230 
4231 instruct Repl16F_evex(vecZ dst, regF src) %{
4232   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4233   match(Set dst (ReplicateF src));
4234   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4235   ins_encode %{
4236     int vector_len = 2;
4237     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4238   %}
4239   ins_pipe( pipe_slow );
4240 %}
4241 
4242 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4243   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4244   match(Set dst (ReplicateF (LoadF mem)));
4245   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4246   ins_encode %{
4247     int vector_len = 2;
4248     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4249   %}
4250   ins_pipe( pipe_slow );
4251 %}
4252 
4253 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4254   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4255   match(Set dst (ReplicateF zero));
4256   format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
4257   ins_encode %{
4258     int vector_len = 2;
4259     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4260   %}
4261   ins_pipe( fpu_reg_reg );
4262 %}
4263 
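// Replicate double (8 byte) scalar to be vector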
4264 instruct Repl4D_evex(vecY dst, regD src) %{
4265   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4266   match(Set dst (ReplicateD src));
4267   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4268   ins_encode %{
4269     int vector_len = 1;
4270     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4271   %}
4272   ins_pipe( pipe_slow );
4273 %}
4274 
4275 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4276   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4277   match(Set dst (ReplicateD (LoadD mem)));
4278   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4279   ins_encode %{
4280     int vector_len = 1;
4281     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4282   %}
4283   ins_pipe( pipe_slow );
4284 %}
4285 
4286 instruct Repl8D_evex(vecZ dst, regD src) %{
4287   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4288   match(Set dst (ReplicateD src));
4289   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4290   ins_encode %{
4291     int vector_len = 2;
4292     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4293   %}
4294   ins_pipe( pipe_slow );
4295 %}
4296 
4297 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4298   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4299   match(Set dst (ReplicateD (LoadD mem)));
4300   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4301   ins_encode %{
4302     int vector_len = 2;
4303     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4304   %}
4305   ins_pipe( pipe_slow );
4306 %}
4307 
4308 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4309   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4310   match(Set dst (ReplicateD zero));
4311   format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4312   ins_encode %{
4313     int vector_len = 2;
4314     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4315   %}
4316   ins_pipe( fpu_reg_reg );
4317 %}
4318 
4319 // ====================REDUCTION ARITHMETIC=======================================
4320 
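// A reduction folds the lanes of vector src2 into a scalar and combines it
// with scalar src1. E.g. for the 2I add reduction below: phaddd leaves a+b
// in the low lane, paddd adds src1, and movd extracts the scalar result.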
4321 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4322   predicate(UseSSE > 2 && UseAVX == 0);
4323   match(Set dst (AddReductionVI src1 src2));
4324   effect(TEMP tmp2, TEMP tmp);
4325   format %{ "movdqu  $tmp2,$src2\n\t"
4326             "phaddd  $tmp2,$tmp2\n\t"
4327             "movd    $tmp,$src1\n\t"
4328             "paddd   $tmp,$tmp2\n\t"
4329             "movd    $dst,$tmp\t! add reduction2I" %}


5386   ins_pipe( pipe_slow );
5387 %}
5388 
5389 // ====================VECTOR ARITHMETIC=======================================
5390 
5391 // --------------------------------- ADD --------------------------------------
5392 
5393 // Bytes vector add
5394 instruct vadd4B(vecS dst, vecS src) %{
5395   predicate(n->as_Vector()->length() == 4);
5396   match(Set dst (AddVB dst src));
5397   format %{ "paddb   $dst,$src\t! add packed4B" %}
5398   ins_encode %{
5399     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5400   %}
5401   ins_pipe( pipe_slow );
5402 %}
5403 
5404 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5405   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5406   match(Set dst (AddVB src1 src2));
5407   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5408   ins_encode %{
5409     int vector_len = 0;
5410     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5411   %}
5412   ins_pipe( pipe_slow );
5413 %}
5414 
5415 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
5416   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5417   match(Set dst (AddVB src (LoadVector mem)));
5418   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5419   ins_encode %{
5420     int vector_len = 0;
5421     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5422   %}
5423   ins_pipe( pipe_slow );
5424 %}
5425 
5426 instruct vadd8B(vecD dst, vecD src) %{
5427   predicate(n->as_Vector()->length() == 8);
5428   match(Set dst (AddVB dst src));
5429   format %{ "paddb   $dst,$src\t! add packed8B" %}
5430   ins_encode %{
5431     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5432   %}
5433   ins_pipe( pipe_slow );
5434 %}
5435 
5436 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
5437   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5438   match(Set dst (AddVB src1 src2));
5439   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5440   ins_encode %{
5441     int vector_len = 0;
5442     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5443   %}
5444   ins_pipe( pipe_slow );
5445 %}
5446 
5447 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
5448   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5449   match(Set dst (AddVB src (LoadVector mem)));
5450   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5451   ins_encode %{
5452     int vector_len = 0;
5453     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5454   %}
5455   ins_pipe( pipe_slow );
5456 %}
5457 
5458 instruct vadd16B(vecX dst, vecX src) %{
5459   predicate(n->as_Vector()->length() == 16);
5460   match(Set dst (AddVB dst src));
5461   format %{ "paddb   $dst,$src\t! add packed16B" %}
5462   ins_encode %{
5463     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5464   %}
5465   ins_pipe( pipe_slow );
5466 %}
5467 
5468 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
5469   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
5470   match(Set dst (AddVB src1 src2));
5471   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5472   ins_encode %{
5473     int vector_len = 0;
5474     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5475   %}
5476   ins_pipe( pipe_slow );
5477 %}


5536   predicate(n->as_Vector()->length() == 2);
5537   match(Set dst (AddVS dst src));
5538   format %{ "paddw   $dst,$src\t! add packed2S" %}
5539   ins_encode %{
5540     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5541   %}
5542   ins_pipe( pipe_slow );
5543 %}
5544 
5545 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
5546   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5547   match(Set dst (AddVS src1 src2));
5548   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5549   ins_encode %{
5550     int vector_len = 0;
5551     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5552   %}
5553   ins_pipe( pipe_slow );
5554 %}
5555 
5556 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
5557   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5558   match(Set dst (AddVS src (LoadVector mem)));
5559   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5560   ins_encode %{
5561     int vector_len = 0;
5562     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5563   %}
5564   ins_pipe( pipe_slow );
5565 %}
5566 
5567 instruct vadd4S(vecD dst, vecD src) %{
5568   predicate(n->as_Vector()->length() == 4);
5569   match(Set dst (AddVS dst src));
5570   format %{ "paddw   $dst,$src\t! add packed4S" %}
5571   ins_encode %{
5572     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5573   %}
5574   ins_pipe( pipe_slow );
5575 %}
5576 
5577 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
5578   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5579   match(Set dst (AddVS src1 src2));
5580   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5581   ins_encode %{
5582     int vector_len = 0;
5583     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5584   %}
5585   ins_pipe( pipe_slow );
5586 %}
5587 
5588 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
5589   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5590   match(Set dst (AddVS src (LoadVector mem)));
5591   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
5592   ins_encode %{
5593     int vector_len = 0;
5594     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5595   %}
5596   ins_pipe( pipe_slow );
5597 %}
5598 
5599 instruct vadd8S(vecX dst, vecX src) %{
5600   predicate(n->as_Vector()->length() == 8);
5601   match(Set dst (AddVS dst src));
5602   format %{ "paddw   $dst,$src\t! add packed8S" %}
5603   ins_encode %{
5604     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5605   %}
5606   ins_pipe( pipe_slow );
5607 %}
5608 
5609 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
5610   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
5611   match(Set dst (AddVS src1 src2));
5612   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
5613   ins_encode %{
5614     int vector_len = 0;
5615     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5616   %}
5617   ins_pipe( pipe_slow );
5618 %}


5677   predicate(n->as_Vector()->length() == 2);
5678   match(Set dst (AddVI dst src));
5679   format %{ "paddd   $dst,$src\t! add packed2I" %}
5680   ins_encode %{
5681     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5682   %}
5683   ins_pipe( pipe_slow );
5684 %}
5685 
5686 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
5687   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5688   match(Set dst (AddVI src1 src2));
5689   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
5690   ins_encode %{
5691     int vector_len = 0;
5692     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5693   %}
5694   ins_pipe( pipe_slow );
5695 %}
5696 
5697 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
5698   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5699   match(Set dst (AddVI src (LoadVector mem)));
5700   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
5701   ins_encode %{
5702     int vector_len = 0;
5703     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5704   %}
5705   ins_pipe( pipe_slow );
5706 %}
5707 
5708 instruct vadd4I(vecX dst, vecX src) %{
5709   predicate(n->as_Vector()->length() == 4);
5710   match(Set dst (AddVI dst src));
5711   format %{ "paddd   $dst,$src\t! add packed4I" %}
5712   ins_encode %{
5713     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5714   %}
5715   ins_pipe( pipe_slow );
5716 %}
5717 
5718 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
5719   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5720   match(Set dst (AddVI src1 src2));
5721   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
5722   ins_encode %{
5723     int vector_len = 0;
5724     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5725   %}
5726   ins_pipe( pipe_slow );
5727 %}


5863   predicate(n->as_Vector()->length() == 2);
5864   match(Set dst (AddVF dst src));
5865   format %{ "addps   $dst,$src\t! add packed2F" %}
5866   ins_encode %{
5867     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5868   %}
5869   ins_pipe( pipe_slow );
5870 %}
5871 
5872 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
5873   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5874   match(Set dst (AddVF src1 src2));
5875   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
5876   ins_encode %{
5877     int vector_len = 0;
5878     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5879   %}
5880   ins_pipe( pipe_slow );
5881 %}
5882 
5883 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
5884   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
5885   match(Set dst (AddVF src (LoadVector mem)));
5886   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
5887   ins_encode %{
5888     int vector_len = 0;
5889     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5890   %}
5891   ins_pipe( pipe_slow );
5892 %}
5893 
5894 instruct vadd4F(vecX dst, vecX src) %{
5895   predicate(n->as_Vector()->length() == 4);
5896   match(Set dst (AddVF dst src));
5897   format %{ "addps   $dst,$src\t! add packed4F" %}
5898   ins_encode %{
5899     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5900   %}
5901   ins_pipe( pipe_slow );
5902 %}
5903 
5904 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
5905   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5906   match(Set dst (AddVF src1 src2));
5907   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
5908   ins_encode %{
5909     int vector_len = 0;
5910     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5911   %}
5912   ins_pipe( pipe_slow );
5913 %}


6051   predicate(n->as_Vector()->length() == 4);
6052   match(Set dst (SubVB dst src));
6053   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6054   ins_encode %{
6055     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6056   %}
6057   ins_pipe( pipe_slow );
6058 %}
6059 
6060 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6061   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6062   match(Set dst (SubVB src1 src2));
6063   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6064   ins_encode %{
6065     int vector_len = 0;
6066     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6067   %}
6068   ins_pipe( pipe_slow );
6069 %}
6070 
6071 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6072   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6073   match(Set dst (SubVB src (LoadVector mem)));
6074   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6075   ins_encode %{
6076     int vector_len = 0;
6077     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6078   %}
6079   ins_pipe( pipe_slow );
6080 %}
6081 
6082 instruct vsub8B(vecD dst, vecD src) %{
6083   predicate(n->as_Vector()->length() == 8);
6084   match(Set dst (SubVB dst src));
6085   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6086   ins_encode %{
6087     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6088   %}
6089   ins_pipe( pipe_slow );
6090 %}
6091 
6092 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6093   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6094   match(Set dst (SubVB src1 src2));
6095   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6096   ins_encode %{
6097     int vector_len = 0;
6098     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6099   %}
6100   ins_pipe( pipe_slow );
6101 %}
6102 
6103 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6104   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6105   match(Set dst (SubVB src (LoadVector mem)));
6106   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6107   ins_encode %{
6108     int vector_len = 0;
6109     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6110   %}
6111   ins_pipe( pipe_slow );
6112 %}
6113 
6114 instruct vsub16B(vecX dst, vecX src) %{
6115   predicate(n->as_Vector()->length() == 16);
6116   match(Set dst (SubVB dst src));
6117   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6118   ins_encode %{
6119     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6120   %}
6121   ins_pipe( pipe_slow );
6122 %}
6123 
6124 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6125   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6126   match(Set dst (SubVB src1 src2));
6127   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6128   ins_encode %{
6129     int vector_len = 0;
6130     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6131   %}
6132   ins_pipe( pipe_slow );
6133 %}


6192   predicate(n->as_Vector()->length() == 2);
6193   match(Set dst (SubVS dst src));
6194   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6195   ins_encode %{
6196     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6197   %}
6198   ins_pipe( pipe_slow );
6199 %}
6200 
6201 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6202   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6203   match(Set dst (SubVS src1 src2));
6204   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6205   ins_encode %{
6206     int vector_len = 0;
6207     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6208   %}
6209   ins_pipe( pipe_slow );
6210 %}
6211 
6212 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6213   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6214   match(Set dst (SubVS src (LoadVector mem)));
6215   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6216   ins_encode %{
6217     int vector_len = 0;
6218     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6219   %}
6220   ins_pipe( pipe_slow );
6221 %}
6222 
6223 instruct vsub4S(vecD dst, vecD src) %{
6224   predicate(n->as_Vector()->length() == 4);
6225   match(Set dst (SubVS dst src));
6226   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6227   ins_encode %{
6228     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6229   %}
6230   ins_pipe( pipe_slow );
6231 %}
6232 
6233 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6234   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6235   match(Set dst (SubVS src1 src2));
6236   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6237   ins_encode %{
6238     int vector_len = 0;
6239     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6240   %}
6241   ins_pipe( pipe_slow );
6242 %}
6243 
6244 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6245   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6246   match(Set dst (SubVS src (LoadVector mem)));
6247   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6248   ins_encode %{
6249     int vector_len = 0;
6250     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6251   %}
6252   ins_pipe( pipe_slow );
6253 %}
6254 
6255 instruct vsub8S(vecX dst, vecX src) %{
6256   predicate(n->as_Vector()->length() == 8);
6257   match(Set dst (SubVS dst src));
6258   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6259   ins_encode %{
6260     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6261   %}
6262   ins_pipe( pipe_slow );
6263 %}
6264 
6265 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6266   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6267   match(Set dst (SubVS src1 src2));
6268   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6269   ins_encode %{
6270     int vector_len = 0;
6271     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6272   %}
6273   ins_pipe( pipe_slow );
6274 %}


6333   predicate(n->as_Vector()->length() == 2);
6334   match(Set dst (SubVI dst src));
6335   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6336   ins_encode %{
6337     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6338   %}
6339   ins_pipe( pipe_slow );
6340 %}
6341 
6342 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
6343   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6344   match(Set dst (SubVI src1 src2));
6345   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
6346   ins_encode %{
6347     int vector_len = 0;
6348     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6349   %}
6350   ins_pipe( pipe_slow );
6351 %}
6352 
6353 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
6354   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6355   match(Set dst (SubVI src (LoadVector mem)));
6356   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
6357   ins_encode %{
6358     int vector_len = 0;
6359     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6360   %}
6361   ins_pipe( pipe_slow );
6362 %}
6363 
6364 instruct vsub4I(vecX dst, vecX src) %{
6365   predicate(n->as_Vector()->length() == 4);
6366   match(Set dst (SubVI dst src));
6367   format %{ "psubd   $dst,$src\t! sub packed4I" %}
6368   ins_encode %{
6369     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6370   %}
6371   ins_pipe( pipe_slow );
6372 %}
6373 
6374 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
6375   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6376   match(Set dst (SubVI src1 src2));
6377   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
6378   ins_encode %{
6379     int vector_len = 0;
6380     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6381   %}
6382   ins_pipe( pipe_slow );
6383 %}


6519   predicate(n->as_Vector()->length() == 2);
6520   match(Set dst (SubVF dst src));
6521   format %{ "subps   $dst,$src\t! sub packed2F" %}
6522   ins_encode %{
6523     __ subps($dst$$XMMRegister, $src$$XMMRegister);
6524   %}
6525   ins_pipe( pipe_slow );
6526 %}
6527 
6528 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
6529   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6530   match(Set dst (SubVF src1 src2));
6531   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
6532   ins_encode %{
6533     int vector_len = 0;
6534     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6535   %}
6536   ins_pipe( pipe_slow );
6537 %}
6538 
6539 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
6540   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6541   match(Set dst (SubVF src (LoadVector mem)));
6542   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
6543   ins_encode %{
6544     int vector_len = 0;
6545     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6546   %}
6547   ins_pipe( pipe_slow );
6548 %}
6549 
6550 instruct vsub4F(vecX dst, vecX src) %{
6551   predicate(n->as_Vector()->length() == 4);
6552   match(Set dst (SubVF dst src));
6553   format %{ "subps   $dst,$src\t! sub packed4F" %}
6554   ins_encode %{
6555     __ subps($dst$$XMMRegister, $src$$XMMRegister);
6556   %}
6557   ins_pipe( pipe_slow );
6558 %}
6559 
6560 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
6561   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6562   match(Set dst (SubVF src1 src2));
6563   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
6564   ins_encode %{
6565     int vector_len = 0;
6566     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6567   %}
6568   ins_pipe( pipe_slow );
6569 %}


6707   predicate(n->as_Vector()->length() == 2);
6708   match(Set dst (MulVS dst src));
6709   format %{ "pmullw $dst,$src\t! mul packed2S" %}
6710   ins_encode %{
6711     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6712   %}
6713   ins_pipe( pipe_slow );
6714 %}
6715 
6716 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
6717   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6718   match(Set dst (MulVS src1 src2));
6719   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
6720   ins_encode %{
6721     int vector_len = 0;
6722     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6723   %}
6724   ins_pipe( pipe_slow );
6725 %}
6726 
6727 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
6728   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6729   match(Set dst (MulVS src (LoadVector mem)));
6730   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
6731   ins_encode %{
6732     int vector_len = 0;
6733     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6734   %}
6735   ins_pipe( pipe_slow );
6736 %}
6737 
6738 instruct vmul4S(vecD dst, vecD src) %{
6739   predicate(n->as_Vector()->length() == 4);
6740   match(Set dst (MulVS dst src));
6741   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
6742   ins_encode %{
6743     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6744   %}
6745   ins_pipe( pipe_slow );
6746 %}
6747 
6748 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
6749   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6750   match(Set dst (MulVS src1 src2));
6751   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
6752   ins_encode %{
6753     int vector_len = 0;
6754     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6755   %}
6756   ins_pipe( pipe_slow );
6757 %}
6758 
6759 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
6760   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6761   match(Set dst (MulVS src (LoadVector mem)));
6762   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
6763   ins_encode %{
6764     int vector_len = 0;
6765     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6766   %}
6767   ins_pipe( pipe_slow );
6768 %}
6769 
6770 instruct vmul8S(vecX dst, vecX src) %{
6771   predicate(n->as_Vector()->length() == 8);
6772   match(Set dst (MulVS dst src));
6773   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
6774   ins_encode %{
6775     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
6776   %}
6777   ins_pipe( pipe_slow );
6778 %}
6779 
6780 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
6781   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6782   match(Set dst (MulVS src1 src2));
6783   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
6784   ins_encode %{
6785     int vector_len = 0;
6786     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6787   %}
6788   ins_pipe( pipe_slow );
6789 %}


6848   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
6849   match(Set dst (MulVI dst src));
6850   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
6851   ins_encode %{
6852     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
6853   %}
6854   ins_pipe( pipe_slow );
6855 %}
6856 
6857 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
6858   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6859   match(Set dst (MulVI src1 src2));
6860   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
6861   ins_encode %{
6862     int vector_len = 0;
6863     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6864   %}
6865   ins_pipe( pipe_slow );
6866 %}
6867 
6868 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
6869   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6870   match(Set dst (MulVI src (LoadVector mem)));
6871   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
6872   ins_encode %{
6873     int vector_len = 0;
6874     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6875   %}
6876   ins_pipe( pipe_slow );
6877 %}
6878 
6879 instruct vmul4I(vecX dst, vecX src) %{
6880   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
6881   match(Set dst (MulVI dst src));
6882   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
6883   ins_encode %{
6884     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
6885   %}
6886   ins_pipe( pipe_slow );
6887 %}
6888 
6889 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
6890   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6891   match(Set dst (MulVI src1 src2));
6892   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
6893   ins_encode %{
6894     int vector_len = 0;
6895     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6896   %}
6897   ins_pipe( pipe_slow );
6898 %}
6899 
6900 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
6901   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6902   match(Set dst (MulVI src (LoadVector mem)));
6903   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
6904   ins_encode %{
6905     int vector_len = 0;
6906     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6907   %}
6908   ins_pipe( pipe_slow );
6909 %}
6910 
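// Longs vector mul (vpmullq is an AVX-512DQ instruction, hence the
// supports_avx512dq() predicates below)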
6911 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
6912   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
6913   match(Set dst (MulVL src1 src2));
6914   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
6915   ins_encode %{
6916     int vector_len = 0;
6917     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6918   %}
6919   ins_pipe( pipe_slow );
6920 %}
6921 
6922 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
6923   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
6924   match(Set dst (MulVL src (LoadVector mem)));
6925   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
6926   ins_encode %{
6927     int vector_len = 0;
6928     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6929   %}
6930   ins_pipe( pipe_slow );
6931 %}
6932 
6933 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
6934   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
6935   match(Set dst (MulVL src1 src2));
6936   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
6937   ins_encode %{
6938     int vector_len = 1;
6939     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6940   %}
6941   ins_pipe( pipe_slow );
6942 %}
6943 
6944 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
6945   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
6946   match(Set dst (MulVL src (LoadVector mem)));
6947   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
6948   ins_encode %{
6949     int vector_len = 1;
6950     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6951   %}
6952   ins_pipe( pipe_slow );
6953 %}
6954 
6955 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6956   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
6957   match(Set dst (MulVL src1 src2));
6958   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
6959   ins_encode %{
6960     int vector_len = 2;
6961     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6962   %}
6963   ins_pipe( pipe_slow );
6964 %}
6965 
6966 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
6967   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
6968   match(Set dst (MulVL src (LoadVector mem)));
6969   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
6970   ins_encode %{
6971     int vector_len = 2;
6972     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6973   %}
6974   ins_pipe( pipe_slow );
6975 %}
6976 
6977 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
6978   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6979   match(Set dst (MulVI src1 src2));
6980   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
6981   ins_encode %{
6982     int vector_len = 1;
6983     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6984   %}
6985   ins_pipe( pipe_slow );
6986 %}
6987 
6988 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
6989   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6990   match(Set dst (MulVI src (LoadVector mem)));
6991   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
6992   ins_encode %{
6993     int vector_len = 1;
6994     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6995   %}
6996   ins_pipe( pipe_slow );
6997 %}
6998 
6999 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7000   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7001   match(Set dst (MulVI src1 src2));
7002   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7003   ins_encode %{
7004     int vector_len = 2;
7005     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7006   %}
7007   ins_pipe( pipe_slow );
7008 %}
7009 
7010 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7011   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7012   match(Set dst (MulVI src (LoadVector mem)));
7013   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7014   ins_encode %{
7015     int vector_len = 2;
7016     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7017   %}
7018   ins_pipe( pipe_slow );
7019 %}
7020 
7021 // Floats vector mul
7022 instruct vmul2F(vecD dst, vecD src) %{
7023   predicate(n->as_Vector()->length() == 2);
7024   match(Set dst (MulVF dst src));
7025   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7026   ins_encode %{
7027     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7028   %}
7029   ins_pipe( pipe_slow );
7030 %}
7031 
7032 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7033   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7034   match(Set dst (MulVF src1 src2));
7035   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7036   ins_encode %{
7037     int vector_len = 0;
7038     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7039   %}
7040   ins_pipe( pipe_slow );
7041 %}
7042 
7043 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7044   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7045   match(Set dst (MulVF src (LoadVector mem)));
7046   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7047   ins_encode %{
7048     int vector_len = 0;
7049     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7050   %}
7051   ins_pipe( pipe_slow );
7052 %}
7053 
7054 instruct vmul4F(vecX dst, vecX src) %{
7055   predicate(n->as_Vector()->length() == 4);
7056   match(Set dst (MulVF dst src));
7057   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7058   ins_encode %{
7059     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7060   %}
7061   ins_pipe( pipe_slow );
7062 %}
7063 
7064 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7065   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7066   match(Set dst (MulVF src1 src2));
7067   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7068   ins_encode %{
7069     int vector_len = 0;
7070     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7071   %}
7072   ins_pipe( pipe_slow );
7073 %}


7211   predicate(n->as_Vector()->length() == 2);
7212   match(Set dst (DivVF dst src));
7213   format %{ "divps   $dst,$src\t! div packed2F" %}
7214   ins_encode %{
7215     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7216   %}
7217   ins_pipe( pipe_slow );
7218 %}
7219 
7220 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7221   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7222   match(Set dst (DivVF src1 src2));
7223   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
7224   ins_encode %{
7225     int vector_len = 0;
7226     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7227   %}
7228   ins_pipe( pipe_slow );
7229 %}
7230 
7231 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
7232   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7233   match(Set dst (DivVF src (LoadVector mem)));
7234   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
7235   ins_encode %{
7236     int vector_len = 0;
7237     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7238   %}
7239   ins_pipe( pipe_slow );
7240 %}
7241 
7242 instruct vdiv4F(vecX dst, vecX src) %{
7243   predicate(n->as_Vector()->length() == 4);
7244   match(Set dst (DivVF dst src));
7245   format %{ "divps   $dst,$src\t! div packed4F" %}
7246   ins_encode %{
7247     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7248   %}
7249   ins_pipe( pipe_slow );
7250 %}
7251 
7252 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
7253   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7254   match(Set dst (DivVF src1 src2));
7255   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
7256   ins_encode %{
7257     int vector_len = 0;
7258     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7259   %}
7260   ins_pipe( pipe_slow );
7261 %}


8499   predicate(n->as_Vector()->length_in_bytes() == 4);
8500   match(Set dst (AndV dst src));
8501   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
8502   ins_encode %{
8503     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8504   %}
8505   ins_pipe( pipe_slow );
8506 %}
8507 
8508 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
8509   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8510   match(Set dst (AndV src1 src2));
8511   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
8512   ins_encode %{
8513     int vector_len = 0;
8514     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8515   %}
8516   ins_pipe( pipe_slow );
8517 %}
8518 
8519 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
8520   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8521   match(Set dst (AndV src (LoadVector mem)));
8522   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
8523   ins_encode %{
8524     int vector_len = 0;
8525     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8526   %}
8527   ins_pipe( pipe_slow );
8528 %}
8529 
8530 instruct vand8B(vecD dst, vecD src) %{
8531   predicate(n->as_Vector()->length_in_bytes() == 8);
8532   match(Set dst (AndV dst src));
8533   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
8534   ins_encode %{
8535     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8536   %}
8537   ins_pipe( pipe_slow );
8538 %}
8539 
8540 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
8541   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8542   match(Set dst (AndV src1 src2));
8543   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
8544   ins_encode %{
8545     int vector_len = 0;
8546     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8547   %}
8548   ins_pipe( pipe_slow );
8549 %}
8550 
8551 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
8552   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8553   match(Set dst (AndV src (LoadVector mem)));
8554   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
8555   ins_encode %{
8556     int vector_len = 0;
8557     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8558   %}
8559   ins_pipe( pipe_slow );
8560 %}
8561 
8562 instruct vand16B(vecX dst, vecX src) %{
8563   predicate(n->as_Vector()->length_in_bytes() == 16);
8564   match(Set dst (AndV dst src));
8565   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
8566   ins_encode %{
8567     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8568   %}
8569   ins_pipe( pipe_slow );
8570 %}
8571 
8572 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
8573   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8574   match(Set dst (AndV src1 src2));
8575   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
8576   ins_encode %{
8577     int vector_len = 0;
8578     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8579   %}
8580   ins_pipe( pipe_slow );
8581 %}


8641   predicate(n->as_Vector()->length_in_bytes() == 4);
8642   match(Set dst (OrV dst src));
8643   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
8644   ins_encode %{
8645     __ por($dst$$XMMRegister, $src$$XMMRegister);
8646   %}
8647   ins_pipe( pipe_slow );
8648 %}
8649 
8650 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
8651   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8652   match(Set dst (OrV src1 src2));
8653   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
8654   ins_encode %{
8655     int vector_len = 0;
8656     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8657   %}
8658   ins_pipe( pipe_slow );
8659 %}
8660 
8661 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
8662   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8663   match(Set dst (OrV src (LoadVector mem)));
8664   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
8665   ins_encode %{
8666     int vector_len = 0;
8667     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8668   %}
8669   ins_pipe( pipe_slow );
8670 %}
8671 
8672 instruct vor8B(vecD dst, vecD src) %{
8673   predicate(n->as_Vector()->length_in_bytes() == 8);
8674   match(Set dst (OrV dst src));
8675   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
8676   ins_encode %{
8677     __ por($dst$$XMMRegister, $src$$XMMRegister);
8678   %}
8679   ins_pipe( pipe_slow );
8680 %}
8681 
8682 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
8683   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8684   match(Set dst (OrV src1 src2));
8685   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
8686   ins_encode %{
8687     int vector_len = 0;
8688     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8689   %}
8690   ins_pipe( pipe_slow );
8691 %}
8692 
8693 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
8694   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8695   match(Set dst (OrV src (LoadVector mem)));
8696   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
8697   ins_encode %{
8698     int vector_len = 0;
8699     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8700   %}
8701   ins_pipe( pipe_slow );
8702 %}
8703 
8704 instruct vor16B(vecX dst, vecX src) %{
8705   predicate(n->as_Vector()->length_in_bytes() == 16);
8706   match(Set dst (OrV dst src));
8707   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
8708   ins_encode %{
8709     __ por($dst$$XMMRegister, $src$$XMMRegister);
8710   %}
8711   ins_pipe( pipe_slow );
8712 %}
8713 
8714 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
8715   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8716   match(Set dst (OrV src1 src2));
8717   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
8718   ins_encode %{
8719     int vector_len = 0;
8720     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8721   %}
8722   ins_pipe( pipe_slow );
8723 %}


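// --------------------------------- XOR --------------------------------------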
8782 instruct vxor4B(vecS dst, vecS src) %{
8783   predicate(n->as_Vector()->length_in_bytes() == 4);
8784   match(Set dst (XorV dst src));
8785   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
8786   ins_encode %{
8787     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
8788   %}
8789   ins_pipe( pipe_slow );
8790 %}
8791 
8792 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
8793   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8794   match(Set dst (XorV src1 src2));
8795   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
8796   ins_encode %{
8797     int vector_len = 0;
8798     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8799   %}
8800   ins_pipe( pipe_slow );
8801 %}
8802 
8803 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
8804   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8805   match(Set dst (XorV src (LoadVector mem)));
8806   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
8807   ins_encode %{
8808     int vector_len = 0;
8809     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8810   %}
8811   ins_pipe( pipe_slow );
8812 %}
8813 
8814 instruct vxor8B(vecD dst, vecD src) %{
8815   predicate(n->as_Vector()->length_in_bytes() == 8);
8816   match(Set dst (XorV dst src));
8817   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
8818   ins_encode %{
8819     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
8820   %}
8821   ins_pipe( pipe_slow );
8822 %}
8823 
8824 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
8825   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8826   match(Set dst (XorV src1 src2));
8827   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
8828   ins_encode %{
8829     int vector_len = 0;
8830     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8831   %}
8832   ins_pipe( pipe_slow );
8833 %}
8834 
8835 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
8836   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8837   match(Set dst (XorV src (LoadVector mem)));
8838   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
8839   ins_encode %{
8840     int vector_len = 0;
8841     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8842   %}
8843   ins_pipe( pipe_slow );
8844 %}
8845 
8846 instruct vxor16B(vecX dst, vecX src) %{
8847   predicate(n->as_Vector()->length_in_bytes() == 16);
8848   match(Set dst (XorV dst src));
8849   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
8850   ins_encode %{
8851     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
8852   %}
8853   ins_pipe( pipe_slow );
8854 %}
8855 
8856 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
8857   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8858   match(Set dst (XorV src1 src2));
8859   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
8860   ins_encode %{
8861     int vector_len = 0;
8862     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8863   %}
8864   ins_pipe( pipe_slow );
8865 %}