src/cpu/x86/vm/x86.ad
Index Unified diffs Context diffs Sdiffs Patch New Old Previous File Next File hotspot Sdiff src/cpu/x86/vm

src/cpu/x86/vm/x86.ad

Print this page
rev 10354 : imported patch vextrinscleanup2
rev 10357 : [mq]: vextrinscleanup5


3162   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3163   match(Set dst (ReplicateB (LoadB mem)));
3164   format %{ "punpcklbw $dst,$mem\n\t"
3165             "pshuflw $dst,$dst,0x00\n\t"
3166             "punpcklqdq $dst,$dst\t! replicate16B" %}
3167   ins_encode %{
3168     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3169     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3170     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3171   %}
3172   ins_pipe( pipe_slow );
3173 %}
3174 
// Broadcast a byte from a GPR into all 32 lanes of a 256-bit (vecY) vector:
// build the 16-byte pattern in the low XMM half, then mirror it into the
// high 128 bits with vinserti128h. Guarded off when AVX512VL+BW is present
// (an EVEX form elsewhere handles that case).
3175 instruct Repl32B(vecY dst, rRegI src) %{
3176   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3177   match(Set dst (ReplicateB src));
3178   format %{ "movd    $dst,$src\n\t"
3179             "punpcklbw $dst,$dst\n\t"
3180             "pshuflw $dst,$dst,0x00\n\t"
3181             "punpcklqdq $dst,$dst\n\t"
3182             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
3183   ins_encode %{
3184     __ movdl($dst$$XMMRegister, $src$$Register);
3185     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3186     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3187     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3188     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3189   %}
3190   ins_pipe( pipe_slow );
3191 %}
3192 
// Same as Repl32B but the byte comes straight from memory; punpcklbw with a
// memory operand folds the load into the first shuffle step.
3193 instruct Repl32B_mem(vecY dst, memory mem) %{
3194   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3195   match(Set dst (ReplicateB (LoadB mem)));
3196   format %{ "punpcklbw $dst,$mem\n\t"
3197             "pshuflw $dst,$dst,0x00\n\t"
3198             "punpcklqdq $dst,$dst\n\t"
3199             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
3200   ins_encode %{
3201     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3202     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3203     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3204     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3205   %}
3206   ins_pipe( pipe_slow );
3207 %}
3208 
// Replicate an immediate byte into all 16 lanes of a 128-bit (vecX) vector:
// replicate8_imm materializes 8 copies as a 64-bit constant-table entry,
// movq loads it, punpcklqdq duplicates it into the upper quadword.
3209 instruct Repl16B_imm(vecX dst, immI con) %{
3210   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3211   match(Set dst (ReplicateB con));
3212   format %{ "movq    $dst,[$constantaddress]\n\t"
3213             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3214   ins_encode %{
3215     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3216     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3217   %}
3218   ins_pipe( pipe_slow );
3219 %}
3220 
// Replicate an immediate byte into all 32 lanes of a 256-bit (vecY) vector:
// load 8 copies from the constant table, duplicate into the upper quadword,
// then mirror the low 128 bits into the high lane.
3221 instruct Repl32B_imm(vecY dst, immI con) %{
3222   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3223   match(Set dst (ReplicateB con));
3224   format %{ "movq    $dst,[$constantaddress]\n\t"
3225             "punpcklqdq $dst,$dst\n\t"
// Fixed format typo: was "lreplicate32B($con)"; every sibling rule prints
// "replicateNB(...)", so the stray leading 'l' was a copy/paste artifact.
3226             "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
3227   ins_encode %{
3228     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3229     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3230     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3231   %}
3232   ins_pipe( pipe_slow );
3233 %}
3234 
// Broadcast a short from a GPR into all 4 lanes of a 64-bit (vecD) vector:
// movd places the value in lane 0, pshuflw 0x00 copies lane 0 across the
// low four word lanes.
3235 instruct Repl4S(vecD dst, rRegI src) %{
3236   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3237   match(Set dst (ReplicateS src));
3238   format %{ "movd    $dst,$src\n\t"
3239             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3240   ins_encode %{
3241     __ movdl($dst$$XMMRegister, $src$$Register);
3242     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3243   %}
3244   ins_pipe( pipe_slow );
3245 %}
3246 
3247 instruct Repl4S_mem(vecD dst, memory mem) %{
3248   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3249   match(Set dst (ReplicateS (LoadS mem)));
3250   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}


3281 %}
3282 
// Replicate an immediate short into all 8 lanes of a 128-bit (vecX) vector:
// replicate8_imm (element size 2) builds 4 copies in a 64-bit constant,
// punpcklqdq doubles that into the upper quadword.
3283 instruct Repl8S_imm(vecX dst, immI con) %{
3284   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3285   match(Set dst (ReplicateS con));
3286   format %{ "movq    $dst,[$constantaddress]\n\t"
3287             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3288   ins_encode %{
3289     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3290     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3291   %}
3292   ins_pipe( pipe_slow );
3293 %}
3294 
// Broadcast a short from a GPR into all 16 lanes of a 256-bit (vecY) vector:
// build the 128-bit pattern (movd + pshuflw + punpcklqdq), then mirror it
// into the high lane with vinserti128h.
3295 instruct Repl16S(vecY dst, rRegI src) %{
3296   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3297   match(Set dst (ReplicateS src));
3298   format %{ "movd    $dst,$src\n\t"
3299             "pshuflw $dst,$dst,0x00\n\t"
3300             "punpcklqdq $dst,$dst\n\t"
3301             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
3302   ins_encode %{
3303     __ movdl($dst$$XMMRegister, $src$$Register);
3304     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3305     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3306     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3307   %}
3308   ins_pipe( pipe_slow );
3309 %}
3310 
// Memory-source variant: pshuflw with a memory operand folds the load into
// the first shuffle step.
3311 instruct Repl16S_mem(vecY dst, memory mem) %{
3312   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3313   match(Set dst (ReplicateS (LoadS mem)));
3314   format %{ "pshuflw $dst,$mem,0x00\n\t"
3315             "punpcklqdq $dst,$dst\n\t"
3316             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
3317   ins_encode %{
3318     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3319     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3320     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3321   %}
3322   ins_pipe( pipe_slow );
3323 %}
3324 
// Immediate variant: 4 copies of the short come from the constant table,
// then widen 64 -> 128 -> 256 bits.
3325 instruct Repl16S_imm(vecY dst, immI con) %{
3326   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3327   match(Set dst (ReplicateS con));
3328   format %{ "movq    $dst,[$constantaddress]\n\t"
3329             "punpcklqdq $dst,$dst\n\t"
3330             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
3331   ins_encode %{
3332     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3333     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3334     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3335   %}
3336   ins_pipe( pipe_slow );
3337 %}
3338 
// Broadcast an int from a GPR into all 4 lanes of a 128-bit (vecX) vector.
// Note: the int rules key off supports_avx512vl() (no BW needed for dwords).
3339 instruct Repl4I(vecX dst, rRegI src) %{
3340   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3341   match(Set dst (ReplicateI src));
3342   format %{ "movd    $dst,$src\n\t"
3343             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3344   ins_encode %{
3345     __ movdl($dst$$XMMRegister, $src$$Register);
3346     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3347   %}
3348   ins_pipe( pipe_slow );
3349 %}
3350 
// Memory-source variant: a single pshufd from memory both loads and
// broadcasts the dword. Requires AVX (UseAVX > 0) per the predicate.
3351 instruct Repl4I_mem(vecX dst, memory mem) %{
3352   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3353   match(Set dst (ReplicateI (LoadI mem)));
3354   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3355   ins_encode %{
3356     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3357   %}
3358   ins_pipe( pipe_slow );
3359 %}
3360 
// Broadcast an int from a GPR into all 8 lanes of a 256-bit (vecY) vector:
// pshufd fills the low 128 bits, vinserti128h mirrors them into the high lane.
3361 instruct Repl8I(vecY dst, rRegI src) %{
3362   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3363   match(Set dst (ReplicateI src));
3364   format %{ "movd    $dst,$src\n\t"
3365             "pshufd  $dst,$dst,0x00\n\t"
3366             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3367   ins_encode %{
3368     __ movdl($dst$$XMMRegister, $src$$Register);
3369     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3370     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3371   %}
3372   ins_pipe( pipe_slow );
3373 %}
3374 
// Memory-source variant: pshufd from memory folds the load into the broadcast.
3375 instruct Repl8I_mem(vecY dst, memory mem) %{
3376   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3377   match(Set dst (ReplicateI (LoadI mem)));
3378   format %{ "pshufd  $dst,$mem,0x00\n\t"
3379             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
3380   ins_encode %{
3381     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3382     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3383   %}
3384   ins_pipe( pipe_slow );
3385 %}
3386 
// Replicate an immediate int into all 4 lanes of a 128-bit (vecX) vector:
// replicate8_imm (element size 4) packs 2 copies into a 64-bit constant,
// punpcklqdq doubles it into the upper quadword.
3387 instruct Repl4I_imm(vecX dst, immI con) %{
3388   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3389   match(Set dst (ReplicateI con));
3390   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3391             "punpcklqdq $dst,$dst" %}
3392   ins_encode %{
3393     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3394     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3395   %}
3396   ins_pipe( pipe_slow );
3397 %}
3398 
// 256-bit immediate variant: widen the 128-bit replicated pattern into the
// high lane with vinserti128h.
3399 instruct Repl8I_imm(vecY dst, immI con) %{
3400   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3401   match(Set dst (ReplicateI con));
3402   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3403             "punpcklqdq $dst,$dst\n\t"
3404             "vinserti128h $dst,$dst,$dst" %}
3405   ins_encode %{
3406     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3407     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3408     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3409   %}
3410   ins_pipe( pipe_slow );
3411 %}
3412 
3413 // Long could be loaded into xmm register directly from memory.
// Broadcast a long from memory into both lanes of a 128-bit (vecX) vector:
// movq loads the low quadword, punpcklqdq copies it into the high quadword.
3414 instruct Repl2L_mem(vecX dst, memory mem) %{
3415   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3416   match(Set dst (ReplicateL (LoadL mem)));
3417   format %{ "movq    $dst,$mem\n\t"
3418             "punpcklqdq $dst,$dst\t! replicate2L" %}
3419   ins_encode %{
3420     __ movq($dst$$XMMRegister, $mem$$Address);
3421     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3422   %}
3423   ins_pipe( pipe_slow );
3424 %}
3425 
3426 // Replicate long (8 byte) scalar to be vector
3427 #ifdef _LP64
// 64-bit VM: the long moves directly GPR -> XMM (movdq), then is doubled to
// 128 bits (punpcklqdq) and mirrored to 256 bits (vinserti128h).
3428 instruct Repl4L(vecY dst, rRegL src) %{
3429   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3430   match(Set dst (ReplicateL src));
3431   format %{ "movdq   $dst,$src\n\t"
3432             "punpcklqdq $dst,$dst\n\t"
3433             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3434   ins_encode %{
3435     __ movdq($dst$$XMMRegister, $src$$Register);
3436     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3437     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3438   %}
3439   ins_pipe( pipe_slow );
3440 %}
3441 #else // _LP64
// 32-bit VM: the long arrives as a register pair; assemble lo/hi halves in
// XMM via two movdl + punpckldq, then widen as in the LP64 case. Needs a
// TEMP xmm for the high half.
3442 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3443   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3444   match(Set dst (ReplicateL src));
3445   effect(TEMP dst, USE src, TEMP tmp);
3446   format %{ "movdl   $dst,$src.lo\n\t"
3447             "movdl   $tmp,$src.hi\n\t"
3448             "punpckldq $dst,$tmp\n\t"
3449             "punpcklqdq $dst,$dst\n\t"
3450             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3451   ins_encode %{
3452     __ movdl($dst$$XMMRegister, $src$$Register);
3453     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3454     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3455     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3456     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3457   %}
3458   ins_pipe( pipe_slow );
3459 %}
3460 #endif // _LP64
3461 
// Replicate an immediate long into all 4 lanes of a 256-bit (vecY) vector:
// the long constant is loaded whole from the constant table, then widened
// 64 -> 128 -> 256 bits.
3462 instruct Repl4L_imm(vecY dst, immL con) %{
3463   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3464   match(Set dst (ReplicateL con));
3465   format %{ "movq    $dst,[$constantaddress]\n\t"
3466             "punpcklqdq $dst,$dst\n\t"
3467             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
3468   ins_encode %{
3469     __ movq($dst$$XMMRegister, $constantaddress($con));
3470     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3471     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3472   %}
3473   ins_pipe( pipe_slow );
3474 %}
3475 
// Memory-source variant of the 256-bit long broadcast.
3476 instruct Repl4L_mem(vecY dst, memory mem) %{
3477   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3478   match(Set dst (ReplicateL (LoadL mem)));
3479   format %{ "movq    $dst,$mem\n\t"
3480             "punpcklqdq $dst,$dst\n\t"
3481             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
3482   ins_encode %{
3483     __ movq($dst$$XMMRegister, $mem$$Address);
3484     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3485     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3486   %}
3487   ins_pipe( pipe_slow );
3488 %}
3489 
// Broadcast a float from memory into both lanes of a 64-bit (vecD) vector;
// one pshufd folds load and broadcast. Requires AVX per the predicate.
3490 instruct Repl2F_mem(vecD dst, memory mem) %{
3491   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3492   match(Set dst (ReplicateF (LoadF mem)));
3493   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3494   ins_encode %{
3495     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3496   %}
3497   ins_pipe( pipe_slow );
3498 %}
3499 
// Same, but fills all 4 lanes of a 128-bit (vecX) vector.
3500 instruct Repl4F_mem(vecX dst, memory mem) %{
3501   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3502   match(Set dst (ReplicateF (LoadF mem)));
3503   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3504   ins_encode %{
3505     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
// Broadcast a float from an XMM register into all 8 lanes of a 256-bit
// (vecY) vector: pshufd fills the low 128 bits, vinsertf128h mirrors them.
3510 instruct Repl8F(vecY dst, regF src) %{
3511   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3512   match(Set dst (ReplicateF src));
3513   format %{ "pshufd  $dst,$src,0x00\n\t"
3514             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3515   ins_encode %{
3516     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3517     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
// Memory-source variant of the 256-bit float broadcast.
3522 instruct Repl8F_mem(vecY dst, memory mem) %{
3523   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524   match(Set dst (ReplicateF (LoadF mem)));
3525   format %{ "pshufd  $dst,$mem,0x00\n\t"
3526             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
3527   ins_encode %{
3528     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3529     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3530   %}
3531   ins_pipe( pipe_slow );
3532 %}
3533 
// Zero a 2-float (vecD) vector with the standard xorps self-xor idiom.
// Limited to UseAVX < 3; an EVEX rule elsewhere presumably covers AVX-512 —
// NOTE(review): that sibling is outside this view, confirm.
3534 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3535   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3536   match(Set dst (ReplicateF zero));
3537   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3538   ins_encode %{
3539     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3540   %}
3541   ins_pipe( fpu_reg_reg );
3542 %}
3543 
3544 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3545   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3546   match(Set dst (ReplicateF zero));
3547   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3548   ins_encode %{
3549     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);


3559     int vector_len = 1;
3560     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3561   %}
3562   ins_pipe( fpu_reg_reg );
3563 %}
3564 
// Broadcast a double from memory into both lanes of a 128-bit (vecX)
// vector; pshufd imm 0x44 selects dwords {0,1,0,1}, i.e. duplicates the
// low quadword. Requires AVX per the predicate.
3565 instruct Repl2D_mem(vecX dst, memory mem) %{
3566   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3567   match(Set dst (ReplicateD (LoadD mem)));
3568   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3569   ins_encode %{
3570     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3571   %}
3572   ins_pipe( pipe_slow );
3573 %}
3574 
// Broadcast a double from an XMM register into all 4 lanes of a 256-bit
// (vecY) vector: pshufd 0x44 duplicates the low quadword, vinsertf128h
// mirrors the low 128 bits into the high lane.
3575 instruct Repl4D(vecY dst, regD src) %{
3576   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3577   match(Set dst (ReplicateD src));
3578   format %{ "pshufd  $dst,$src,0x44\n\t"
3579             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3580   ins_encode %{
3581     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3582     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3583   %}
3584   ins_pipe( pipe_slow );
3585 %}
3586 
// Memory-source variant of the 256-bit double broadcast.
3587 instruct Repl4D_mem(vecY dst, memory mem) %{
3588   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3589   match(Set dst (ReplicateD (LoadD mem)));
3590   format %{ "pshufd  $dst,$mem,0x44\n\t"
3591             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
3592   ins_encode %{
3593     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3594     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
3595   %}
3596   ins_pipe( pipe_slow );
3597 %}
3598 
3599 // Replicate double (8 byte) scalar zero to be vector
// Zero a 2-double (vecX) vector via the xorpd self-xor idiom; pre-AVX-512
// (UseAVX < 3) only.
3600 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3601   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3602   match(Set dst (ReplicateD zero));
3603   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3604   ins_encode %{
3605     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3606   %}
3607   ins_pipe( fpu_reg_reg );
3608 %}
3609 
3610 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3611   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3612   match(Set dst (ReplicateD zero));
3613   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3614   ins_encode %{


4774             "movd    $dst,$tmp2\t! add reduction4I" %}
4775   ins_encode %{
4776     int vector_len = 0;
4777     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4778     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4779     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4780     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4781     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4782     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4783     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4784   %}
4785   ins_pipe( pipe_slow );
4786 %}
4787 
// Horizontal add-reduction of an 8-int (vecY) vector plus scalar src1, for
// AVX1/AVX2-only CPUs (supports_avxonly): two 256-bit vphaddd passes fold
// the lanes pairwise, the high 128 bits are extracted and added to the low
// 128 bits, then the scalar is folded in and the result moved to a GPR.
// NOTE(review): the second vphaddd reads $tmp2 before it is written — its
// contribution lands only in lanes that are never consumed; confirm against
// the upstream rule.
4788 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4789   predicate(VM_Version::supports_avxonly());
4790   match(Set dst (AddReductionVI src1 src2));
4791   effect(TEMP tmp, TEMP tmp2);
4792   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4793             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4794             "vextracti128  $tmp2,$tmp\n\t"
4795             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4796             "movd     $tmp2,$src1\n\t"
4797             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4798             "movd     $dst,$tmp2\t! add reduction8I" %}
4799   ins_encode %{
4800     int vector_len = 1;
4801     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4802     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4803     __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
4804     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4805     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4806     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4807     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4808   %}
4809   ins_pipe( pipe_slow );
4810 %}
4811 
// EVEX-era (UseAVX > 2) 8-int add-reduction: extract the high 128 bits, add
// to the low half, then two pshufd/vpaddd steps fold 4 -> 2 -> 1 lanes;
// finally the scalar src1 is added and the result moved to the GPR dst.
// All vpaddd steps use vector_len 0 (128-bit) after the initial extract.
4812 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4813   predicate(UseAVX > 2);
4814   match(Set dst (AddReductionVI src1 src2));
4815   effect(TEMP tmp, TEMP tmp2);
4816   format %{ "vextracti128  $tmp,$src2\n\t"
4817             "vpaddd  $tmp,$tmp,$src2\n\t"
4818             "pshufd  $tmp2,$tmp,0xE\n\t"
4819             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4820             "pshufd  $tmp2,$tmp,0x1\n\t"
4821             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4822             "movd    $tmp2,$src1\n\t"
4823             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4824             "movd    $dst,$tmp2\t! add reduction8I" %}
4825   ins_encode %{
4826     int vector_len = 0;
4827     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4828     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4829     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4830     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4831     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4832     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4833     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4834     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4835     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4836   %}
4837   ins_pipe( pipe_slow );
4838 %}
4839 
// EVEX 16-int (vecZ) add-reduction: fold 512 -> 256 (vextracti64x4h + add),
// 256 -> 128 (vextracti128h + add), then 4 -> 2 -> 1 lanes via pshufd/vpaddd,
// add the scalar src1, and move the result to the GPR dst.
4840 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4841   predicate(UseAVX > 2);
4842   match(Set dst (AddReductionVI src1 src2));
4843   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4844   format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
4845             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4846             "vextracti128   $tmp,$tmp3\n\t"
4847             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4848             "pshufd  $tmp2,$tmp,0xE\n\t"
4849             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4850             "pshufd  $tmp2,$tmp,0x1\n\t"
4851             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4852             "movd    $tmp2,$src1\n\t"
4853             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
// Fixed format typo: this is an AddReductionVI rule, so the trailing
// assembly comment must read "add reduction16I", not "mul reduction16I".
4854             "movd    $dst,$tmp2\t! add reduction16I" %}
4855   ins_encode %{
4856     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4857     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4858     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
4859     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4860     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4861     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4862     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4863     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4864     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4865     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4866     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4867   %}
4868   ins_pipe( pipe_slow );
4869 %}
4870 
4871 #ifdef _LP64
// 2-long (vecX) add-reduction, LP64 + UseAVX > 2: pshufd 0xE brings the
// high quadword down, vpaddq folds the two lanes, then the scalar src1 is
// added and the 64-bit result moved to the GPR dst.
4872 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4873   predicate(UseAVX > 2);
4874   match(Set dst (AddReductionVL src1 src2));
4875   effect(TEMP tmp, TEMP tmp2);
4876   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4877             "vpaddq  $tmp,$src2,$tmp2\n\t"
4878             "movdq   $tmp2,$src1\n\t"
4879             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4880             "movdq   $dst,$tmp2\t! add reduction2L" %}
4881   ins_encode %{
4882     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4883     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4884     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4885     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4886     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4887   %}
4888   ins_pipe( pipe_slow );
4889 %}
4890 
// 4-long (vecY) add-reduction: fold 256 -> 128 bits (vextracti128h + vpaddq),
// then 2 -> 1 lanes (pshufd 0xE + vpaddq), add the scalar src1, and move the
// 64-bit result to the GPR dst.
4891 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4892   predicate(UseAVX > 2);
4893   match(Set dst (AddReductionVL src1 src2));
4894   effect(TEMP tmp, TEMP tmp2);
4895   format %{ "vextracti128  $tmp,$src2\n\t"
4896             "vpaddq  $tmp2,$tmp,$src2\n\t"
4897             "pshufd  $tmp,$tmp2,0xE\n\t"
4898             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4899             "movdq   $tmp,$src1\n\t"
4900             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4901             "movdq   $dst,$tmp2\t! add reduction4L" %}
4902   ins_encode %{
4903     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
4904     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4905     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4906     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4907     __ movdq($tmp$$XMMRegister, $src1$$Register);
4908     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4909     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4910   %}
4911   ins_pipe( pipe_slow );
4912 %}
4913 
// 8-long (vecZ) add-reduction: fold 512 -> 256 (vextracti64x4h + vpaddq),
// 256 -> 128 (vextracti128h + vpaddq), 2 -> 1 lanes (pshufd 0xE + vpaddq),
// then add the scalar src1 and move the 64-bit result to the GPR dst.
4914 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4915   predicate(UseAVX > 2);
4916   match(Set dst (AddReductionVL src1 src2));
4917   effect(TEMP tmp, TEMP tmp2);
4918   format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
4919             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4920             "vextracti128   $tmp,$tmp2\n\t"
4921             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4922             "pshufd  $tmp,$tmp2,0xE\n\t"
4923             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4924             "movdq   $tmp,$src1\n\t"
4925             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4926             "movdq   $dst,$tmp2\t! add reduction8L" %}
4927   ins_encode %{
4928     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4929     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4930     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
4931     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4932     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4933     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4934     __ movdq($tmp$$XMMRegister, $src1$$Register);
4935     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4936     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4937   %}
4938   ins_pipe( pipe_slow );
4939 %}
4940 #endif
4941 
4942 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4943   predicate(UseSSE >= 1 && UseAVX == 0);
4944   match(Set dst (AddReductionVF dst src2));
4945   effect(TEMP dst, TEMP tmp);
4946   format %{ "addss   $dst,$src2\n\t"
4947             "pshufd  $tmp,$src2,0x01\n\t"
4948             "addss   $dst,$tmp\t! add reduction2F" %}
4949   ins_encode %{
4950     __ addss($dst$$XMMRegister, $src2$$XMMRegister);


5009     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5010     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5011     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5012     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5013     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5014   %}
5015   ins_pipe( pipe_slow );
5016 %}
5017 
// 8-float (vecY) ordered add-reduction into scalar dst (dst is both input
// accumulator and result, hence TEMP dst): add the four low-lane elements
// one at a time via pshufd lane selects + vaddss, then vextractf128h pulls
// the high 128 bits into tmp2 and the remaining four elements are added the
// same way. Strictly sequential adds preserve FP ordering semantics.
5018 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5019   predicate(UseAVX > 0);
5020   match(Set dst (AddReductionVF dst src2));
5021   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5022   format %{ "vaddss  $dst,$dst,$src2\n\t"
5023             "pshufd  $tmp,$src2,0x01\n\t"
5024             "vaddss  $dst,$dst,$tmp\n\t"
5025             "pshufd  $tmp,$src2,0x02\n\t"
5026             "vaddss  $dst,$dst,$tmp\n\t"
5027             "pshufd  $tmp,$src2,0x03\n\t"
5028             "vaddss  $dst,$dst,$tmp\n\t"
5029             "vextractf128  $tmp2,$src2\n\t"
5030             "vaddss  $dst,$dst,$tmp2\n\t"
5031             "pshufd  $tmp,$tmp2,0x01\n\t"
5032             "vaddss  $dst,$dst,$tmp\n\t"
5033             "pshufd  $tmp,$tmp2,0x02\n\t"
5034             "vaddss  $dst,$dst,$tmp\n\t"
5035             "pshufd  $tmp,$tmp2,0x03\n\t"
5036             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5037   ins_encode %{
5038     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5039     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5040     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5041     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5042     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5043     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5044     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5045     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5046     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5047     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5048     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5049     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5050     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5051     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5052     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5053   %}
5054   ins_pipe( pipe_slow );
5055 %}
5056 
// 16-float (vecZ) ordered add-reduction into scalar dst: same strictly
// sequential vaddss scheme as radd8F, repeated for each of the four 128-bit
// quarters; quarters 1..3 are pulled down with vextractf32x4h (imm 0x1/0x2/
// 0x3) into tmp2 before their lanes are added one at a time.
5057 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5058   predicate(UseAVX > 2);
5059   match(Set dst (AddReductionVF dst src2));
5060   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5061   format %{ "vaddss  $dst,$dst,$src2\n\t"
5062             "pshufd  $tmp,$src2,0x01\n\t"
5063             "vaddss  $dst,$dst,$tmp\n\t"
5064             "pshufd  $tmp,$src2,0x02\n\t"
5065             "vaddss  $dst,$dst,$tmp\n\t"
5066             "pshufd  $tmp,$src2,0x03\n\t"
5067             "vaddss  $dst,$dst,$tmp\n\t"
5068             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5069             "vaddss  $dst,$dst,$tmp2\n\t"
5070             "pshufd  $tmp,$tmp2,0x01\n\t"
5071             "vaddss  $dst,$dst,$tmp\n\t"
5072             "pshufd  $tmp,$tmp2,0x02\n\t"
5073             "vaddss  $dst,$dst,$tmp\n\t"
5074             "pshufd  $tmp,$tmp2,0x03\n\t"
5075             "vaddss  $dst,$dst,$tmp\n\t"
5076             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5077             "vaddss  $dst,$dst,$tmp2\n\t"
5078             "pshufd  $tmp,$tmp2,0x01\n\t"
5079             "vaddss  $dst,$dst,$tmp\n\t"
5080             "pshufd  $tmp,$tmp2,0x02\n\t"
5081             "vaddss  $dst,$dst,$tmp\n\t"
5082             "pshufd  $tmp,$tmp2,0x03\n\t"
5083             "vaddss  $dst,$dst,$tmp\n\t"
5084             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5085             "vaddss  $dst,$dst,$tmp2\n\t"
5086             "pshufd  $tmp,$tmp2,0x01\n\t"
5087             "vaddss  $dst,$dst,$tmp\n\t"
5088             "pshufd  $tmp,$tmp2,0x02\n\t"
5089             "vaddss  $dst,$dst,$tmp\n\t"
5090             "pshufd  $tmp,$tmp2,0x03\n\t"
5091             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5092   ins_encode %{
5093     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5094     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5095     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5096     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5097     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5098     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5099     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5100     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5101     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5102     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5103     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5104     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5105     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5106     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5107     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5108     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5109     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5110     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5111     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5112     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5113     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5114     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5115     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5116     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5117     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5118     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5119     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5120     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5121     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5122     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5123     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5124   %}
5125   ins_pipe( pipe_slow );
5126 %}
5127 
5128 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5129   predicate(UseSSE >= 1 && UseAVX == 0);
5130   match(Set dst (AddReductionVD dst src2));
5131   effect(TEMP tmp, TEMP dst);
5132   format %{ "addsd   $dst,$src2\n\t"
5133             "pshufd  $tmp,$src2,0xE\n\t"
5134             "addsd   $dst,$tmp\t! add reduction2D" %}
5135   ins_encode %{
5136     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);


5145   match(Set dst (AddReductionVD dst src2));
5146   effect(TEMP tmp, TEMP dst);
5147   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5148             "pshufd  $tmp,$src2,0xE\n\t"
5149             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5150   ins_encode %{
5151     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5152     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5153     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5154   %}
5155   ins_pipe( pipe_slow );
5156 %}
5157 
// AVX add-reduction of 4 packed doubles: dst += sum of all 4 lanes of src2.
// Lanes 0/1 are accumulated directly; pshufd 0xE moves the odd 64-bit lane
// down; the upper 128 bits are extracted into tmp2 and reduced the same way.
5158 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5159   predicate(UseAVX > 0);
5160   match(Set dst (AddReductionVD dst src2));
5161   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5162   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5163             "pshufd  $tmp,$src2,0xE\n\t"
5164             "vaddsd  $dst,$dst,$tmp\n\t"
5165             "vextractf32x4h  $tmp2,$src2, 0x1\n\t"
5166             "vaddsd  $dst,$dst,$tmp2\n\t"
5167             "pshufd  $tmp,$tmp2,0xE\n\t"
5168             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5169   ins_encode %{
5170     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5171     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5172     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5173     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5174     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5175     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5176     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5177   %}
5178   ins_pipe( pipe_slow );
5179 %}
5180 
// AVX-512 add-reduction of 8 packed doubles (512-bit src2): each of the four
// 128-bit quarters is extracted into tmp2 (indices 0x1..0x3) and its two
// 64-bit lanes are accumulated into dst via scalar vaddsd; pshufd 0xE brings
// the odd lane of each quarter into position.
5181 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5182   predicate(UseAVX > 2);
5183   match(Set dst (AddReductionVD dst src2));
5184   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5185   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5186             "pshufd  $tmp,$src2,0xE\n\t"
5187             "vaddsd  $dst,$dst,$tmp\n\t"
5188             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5189             "vaddsd  $dst,$dst,$tmp2\n\t"
5190             "pshufd  $tmp,$tmp2,0xE\n\t"
5191             "vaddsd  $dst,$dst,$tmp\n\t"
5192             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5193             "vaddsd  $dst,$dst,$tmp2\n\t"
5194             "pshufd  $tmp,$tmp2,0xE\n\t"
5195             "vaddsd  $dst,$dst,$tmp\n\t"
5196             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5197             "vaddsd  $dst,$dst,$tmp2\n\t"
5198             "pshufd  $tmp,$tmp2,0xE\n\t"
5199             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5200   ins_encode %{
5201     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5202     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5203     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5204     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5205     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5206     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5207     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5208     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5209     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5210     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5211     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5212     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5213     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5214     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5215     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5216   %}
5217   ins_pipe( pipe_slow );
5218 %}
5219 
5220 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5221   predicate(UseSSE > 3 && UseAVX == 0);
5222   match(Set dst (MulReductionVI src1 src2));
5223   effect(TEMP tmp, TEMP tmp2);
5224   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5225             "pmulld  $tmp2,$src2\n\t"
5226             "movd    $tmp,$src1\n\t"
5227             "pmulld  $tmp2,$tmp\n\t"
5228             "movd    $dst,$tmp2\t! mul reduction2I" %}
5229   ins_encode %{
5230     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5231     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5232     __ movdl($tmp$$XMMRegister, $src1$$Register);


5290             "movd     $tmp2,$src1\n\t"
5291             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5292             "movd     $dst,$tmp2\t! mul reduction4I" %}
5293   ins_encode %{
5294     int vector_len = 0;
5295     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5296     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5297     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5298     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5299     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5300     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5301     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5302   %}
5303   ins_pipe( pipe_slow );
5304 %}
5305 
// AVX mul-reduction of 8 packed ints: dst = src1 * product of all 8 lanes of
// src2. The high 128-bit half is folded onto the low half with vextracti128h,
// then lanes are pairwise multiplied down (pshufd 0xE, then 0x1) before the
// scalar src1 factor is multiplied in via movdl/vpmulld.
5306 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5307   predicate(UseAVX > 0);
5308   match(Set dst (MulReductionVI src1 src2));
5309   effect(TEMP tmp, TEMP tmp2);
5310   format %{ "vextracti128  $tmp,$src2\n\t"
5311             "vpmulld  $tmp,$tmp,$src2\n\t"
5312             "pshufd   $tmp2,$tmp,0xE\n\t"
5313             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5314             "pshufd   $tmp2,$tmp,0x1\n\t"
5315             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5316             "movd     $tmp2,$src1\n\t"
5317             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5318             "movd     $dst,$tmp2\t! mul reduction8I" %}
5319   ins_encode %{
5320     int vector_len = 0;
5321     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5322     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5323     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5324     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5325     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5326     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5327     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5328     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5329     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5330   %}
5331   ins_pipe( pipe_slow );
5332 %}
5333 
// AVX-512 mul-reduction of 16 packed ints: dst = src1 * product of all 16
// lanes of src2. The 256-bit halves are folded with vextracti64x4 (256-bit
// vpmulld, vector_len=1), then the 128-bit halves with vextracti128, then the
// remaining 4 lanes are multiplied down (pshufd 0xE / 0x1) before src1 is
// multiplied in and the result moved to the GPR dst.
// Fix: the format string previously printed "vpmulld $tmp,$tmp,$src2" for the
// second fold, but the encoding multiplies tmp by tmp3 — format now matches.
5334 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5335   predicate(UseAVX > 2);
5336   match(Set dst (MulReductionVI src1 src2));
5337   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5338   format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
5339             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5340             "vextracti128   $tmp,$tmp3\n\t"
5341             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5342             "pshufd   $tmp2,$tmp,0xE\n\t"
5343             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5344             "pshufd   $tmp2,$tmp,0x1\n\t"
5345             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5346             "movd     $tmp2,$src1\n\t"
5347             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5348             "movd     $dst,$tmp2\t! mul reduction16I" %}
5349   ins_encode %{
5350     __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5351     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5352     __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
5353     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5354     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5355     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5356     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5357     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5358     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5359     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5360     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5361   %}
5362   ins_pipe( pipe_slow );
5363 %}
5364 
5365 #ifdef _LP64
// AVX-512DQ mul-reduction of 2 packed longs: dst = src1 * lane0 * lane1.
// pshufd 0xE moves the high 64-bit lane down; vpmullq needs avx512dq.
5366 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5367   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5368   match(Set dst (MulReductionVL src1 src2));
5369   effect(TEMP tmp, TEMP tmp2);
5370   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5371             "vpmullq  $tmp,$src2,$tmp2\n\t"
5372             "movdq    $tmp2,$src1\n\t"
5373             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5374             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5375   ins_encode %{
5376     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5377     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5378     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5379     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5380     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5381   %}
5382   ins_pipe( pipe_slow );
5383 %}
5384 
// AVX-512DQ mul-reduction of 4 packed longs: fold the high 128-bit half onto
// the low half with vextracti128h, multiply the two remaining lanes together
// (pshufd 0xE), then multiply in the scalar src1 factor.
5385 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5386   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5387   match(Set dst (MulReductionVL src1 src2));
5388   effect(TEMP tmp, TEMP tmp2);
5389   format %{ "vextracti128  $tmp,$src2\n\t"
5390             "vpmullq  $tmp2,$tmp,$src2\n\t"
5391             "pshufd   $tmp,$tmp2,0xE\n\t"
5392             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5393             "movdq    $tmp,$src1\n\t"
5394             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5395             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5396   ins_encode %{
5397     __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
5398     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5399     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5400     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5401     __ movdq($tmp$$XMMRegister, $src1$$Register);
5402     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5403     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5404   %}
5405   ins_pipe( pipe_slow );
5406 %}
5407 
// AVX-512DQ mul-reduction of 8 packed longs: fold 512->256 bits with
// vextracti64x4h (256-bit vpmullq, vector_len=1), then 256->128 with
// vextracti128h, then the last two lanes (pshufd 0xE), finally multiply in
// the scalar src1 factor and move the product to the GPR dst.
5408 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5409   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5410   match(Set dst (MulReductionVL src1 src2));
5411   effect(TEMP tmp, TEMP tmp2);
5412   format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
5413             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5414             "vextracti128   $tmp,$tmp2\n\t"
5415             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5416             "pshufd   $tmp,$tmp2,0xE\n\t"
5417             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5418             "movdq    $tmp,$src1\n\t"
5419             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5420             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5421   ins_encode %{
5422     __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5423     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5424     __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
5425     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5426     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5427     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5428     __ movdq($tmp$$XMMRegister, $src1$$Register);
5429     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5430     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5431   %}
5432   ins_pipe( pipe_slow );
5433 %}
5434 #endif
5435 
5436 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5437   predicate(UseSSE >= 1 && UseAVX == 0);
5438   match(Set dst (MulReductionVF dst src2));
5439   effect(TEMP dst, TEMP tmp);
5440   format %{ "mulss   $dst,$src2\n\t"
5441             "pshufd  $tmp,$src2,0x01\n\t"
5442             "mulss   $dst,$tmp\t! mul reduction2F" %}
5443   ins_encode %{
5444     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);


5503     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5504     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5505     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5506     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5507     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5508   %}
5509   ins_pipe( pipe_slow );
5510 %}
5511 
// AVX mul-reduction of 8 packed floats: dst *= product of all 8 lanes of
// src2. Lanes 0-3 are multiplied in via pshufd 0x01/0x02/0x03 + scalar
// vmulss; the high 128 bits are then extracted into tmp2 and reduced the
// same way.
5512 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5513   predicate(UseAVX > 0);
5514   match(Set dst (MulReductionVF dst src2));
5515   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5516   format %{ "vmulss  $dst,$dst,$src2\n\t"
5517             "pshufd  $tmp,$src2,0x01\n\t"
5518             "vmulss  $dst,$dst,$tmp\n\t"
5519             "pshufd  $tmp,$src2,0x02\n\t"
5520             "vmulss  $dst,$dst,$tmp\n\t"
5521             "pshufd  $tmp,$src2,0x03\n\t"
5522             "vmulss  $dst,$dst,$tmp\n\t"
5523             "vextractf128  $tmp2,$src2\n\t"
5524             "vmulss  $dst,$dst,$tmp2\n\t"
5525             "pshufd  $tmp,$tmp2,0x01\n\t"
5526             "vmulss  $dst,$dst,$tmp\n\t"
5527             "pshufd  $tmp,$tmp2,0x02\n\t"
5528             "vmulss  $dst,$dst,$tmp\n\t"
5529             "pshufd  $tmp,$tmp2,0x03\n\t"
5530             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5531   ins_encode %{
5532     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5533     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5534     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5535     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5536     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5537     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5538     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5539     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5540     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5541     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5542     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5543     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5544     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5545     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5546     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5547   %}
5548   ins_pipe( pipe_slow );
5549 %}
5550 
// AVX-512 mul-reduction of 16 packed floats: each 128-bit quarter of the
// 512-bit src2 is extracted into tmp2 (indices 0x1..0x3) and its four lanes
// are multiplied into dst via pshufd 0x01/0x02/0x03 + scalar vmulss.
5551 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5552   predicate(UseAVX > 2);
5553   match(Set dst (MulReductionVF dst src2));
5554   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5555   format %{ "vmulss  $dst,$dst,$src2\n\t"
5556             "pshufd  $tmp,$src2,0x01\n\t"
5557             "vmulss  $dst,$dst,$tmp\n\t"
5558             "pshufd  $tmp,$src2,0x02\n\t"
5559             "vmulss  $dst,$dst,$tmp\n\t"
5560             "pshufd  $tmp,$src2,0x03\n\t"
5561             "vmulss  $dst,$dst,$tmp\n\t"
5562             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5563             "vmulss  $dst,$dst,$tmp2\n\t"
5564             "pshufd  $tmp,$tmp2,0x01\n\t"
5565             "vmulss  $dst,$dst,$tmp\n\t"
5566             "pshufd  $tmp,$tmp2,0x02\n\t"
5567             "vmulss  $dst,$dst,$tmp\n\t"
5568             "pshufd  $tmp,$tmp2,0x03\n\t"
5569             "vmulss  $dst,$dst,$tmp\n\t"
5570             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5571             "vmulss  $dst,$dst,$tmp2\n\t"
5572             "pshufd  $tmp,$tmp2,0x01\n\t"
5573             "vmulss  $dst,$dst,$tmp\n\t"
5574             "pshufd  $tmp,$tmp2,0x02\n\t"
5575             "vmulss  $dst,$dst,$tmp\n\t"
5576             "pshufd  $tmp,$tmp2,0x03\n\t"
5577             "vmulss  $dst,$dst,$tmp\n\t"
5578             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5579             "vmulss  $dst,$dst,$tmp2\n\t"
5580             "pshufd  $tmp,$tmp2,0x01\n\t"
5581             "vmulss  $dst,$dst,$tmp\n\t"
5582             "pshufd  $tmp,$tmp2,0x02\n\t"
5583             "vmulss  $dst,$dst,$tmp\n\t"
5584             "pshufd  $tmp,$tmp2,0x03\n\t"
5585             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5586   ins_encode %{
5587     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5588     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5589     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5590     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5591     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5592     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5593     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5594     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5595     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5596     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5597     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5598     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5599     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5600     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5601     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5602     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5603     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5604     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5605     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5606     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5607     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5608     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5609     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5610     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5611     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5612     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5613     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5614     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5615     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5616     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5617     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5618   %}
5619   ins_pipe( pipe_slow );
5620 %}
5621 
5622 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5623   predicate(UseSSE >= 1 && UseAVX == 0);
5624   match(Set dst (MulReductionVD dst src2));
5625   effect(TEMP dst, TEMP tmp);
5626   format %{ "mulsd   $dst,$src2\n\t"
5627             "pshufd  $tmp,$src2,0xE\n\t"
5628             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5629   ins_encode %{
5630     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);


5639   match(Set dst (MulReductionVD dst src2));
5640   effect(TEMP tmp, TEMP dst);
5641   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5642             "pshufd  $tmp,$src2,0xE\n\t"
5643             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5644   ins_encode %{
5645     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5646     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5647     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
// AVX mul-reduction of 4 packed doubles: dst *= product of all 4 lanes of
// src2. pshufd 0xE brings the odd 64-bit lane down; the upper 128 bits are
// extracted into tmp2 and reduced the same way.
5652 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5653   predicate(UseAVX > 0);
5654   match(Set dst (MulReductionVD dst src2));
5655   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5656   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5657             "pshufd  $tmp,$src2,0xE\n\t"
5658             "vmulsd  $dst,$dst,$tmp\n\t"
5659             "vextractf128  $tmp2,$src2\n\t"
5660             "vmulsd  $dst,$dst,$tmp2\n\t"
5661             "pshufd  $tmp,$tmp2,0xE\n\t"
5662             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5663   ins_encode %{
5664     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5665     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5666     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5667     __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
5668     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5669     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5670     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5671   %}
5672   ins_pipe( pipe_slow );
5673 %}
5674 
// AVX-512 mul-reduction of 8 packed doubles: each 128-bit quarter of the
// 512-bit src2 is extracted into tmp2 (indices 0x1..0x3) and its two 64-bit
// lanes are multiplied into dst via pshufd 0xE + scalar vmulsd.
// Fix: the second pshufd in the format string previously printed $src2 as its
// source, but the encoding shuffles tmp2 — format now matches the encoding.
5675 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5676   predicate(UseAVX > 2);
5677   match(Set dst (MulReductionVD dst src2));
5678   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5679   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5680             "pshufd  $tmp,$src2,0xE\n\t"
5681             "vmulsd  $dst,$dst,$tmp\n\t"
5682             "vextractf32x4  $tmp2,$src2, 0x1\n\t"
5683             "vmulsd  $dst,$dst,$tmp2\n\t"
5684             "pshufd  $tmp,$tmp2,0xE\n\t"
5685             "vmulsd  $dst,$dst,$tmp\n\t"
5686             "vextractf32x4  $tmp2,$src2, 0x2\n\t"
5687             "vmulsd  $dst,$dst,$tmp2\n\t"
5688             "pshufd  $tmp,$tmp2,0xE\n\t"
5689             "vmulsd  $dst,$dst,$tmp\n\t"
5690             "vextractf32x4  $tmp2,$src2, 0x3\n\t"
5691             "vmulsd  $dst,$dst,$tmp2\n\t"
5692             "pshufd  $tmp,$tmp2,0xE\n\t"
5693             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5694   ins_encode %{
5695     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5696     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5697     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5698     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5699     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5700     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5701     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5702     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5703     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5704     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5705     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5706     __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5707     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5708     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5709     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5710   %}
5711   ins_pipe( pipe_slow );
5712 %}
5713 
5714 // ====================VECTOR ARITHMETIC=======================================
5715 
5716 // --------------------------------- ADD --------------------------------------
5717 
5718 // Bytes vector add
5719 instruct vadd4B(vecS dst, vecS src) %{
5720   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5721   match(Set dst (AddVB dst src));
5722   format %{ "paddb   $dst,$src\t! add packed4B" %}
5723   ins_encode %{
5724     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5725   %}
5726   ins_pipe( pipe_slow );




3162   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3163   match(Set dst (ReplicateB (LoadB mem)));
3164   format %{ "punpcklbw $dst,$mem\n\t"
3165             "pshuflw $dst,$dst,0x00\n\t"
3166             "punpcklqdq $dst,$dst\t! replicate16B" %}
3167   ins_encode %{
3168     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3169     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3170     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3171   %}
3172   ins_pipe( pipe_slow );
3173 %}
3174 
// Replicate a byte from a GPR into all 32 lanes of a 256-bit YMM register:
// broadcast into the low 128 bits (punpcklbw/pshuflw/punpcklqdq), then copy
// the low half into the high half with vinserti128_high.
3175 instruct Repl32B(vecY dst, rRegI src) %{
3176   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3177   match(Set dst (ReplicateB src));
3178   format %{ "movd    $dst,$src\n\t"
3179             "punpcklbw $dst,$dst\n\t"
3180             "pshuflw $dst,$dst,0x00\n\t"
3181             "punpcklqdq $dst,$dst\n\t"
3182             "vinserti128_high $dst,$dst\t! replicate32B" %}
3183   ins_encode %{
3184     __ movdl($dst$$XMMRegister, $src$$Register);
3185     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3186     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3187     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3188     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3189   %}
3190   ins_pipe( pipe_slow );
3191 %}
3192 
// Replicate a byte loaded from memory into all 32 lanes of a YMM register;
// same broadcast sequence as Repl32B but starting from a memory operand.
3193 instruct Repl32B_mem(vecY dst, memory mem) %{
3194   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3195   match(Set dst (ReplicateB (LoadB mem)));
3196   format %{ "punpcklbw $dst,$mem\n\t"
3197             "pshuflw $dst,$dst,0x00\n\t"
3198             "punpcklqdq $dst,$dst\n\t"
3199             "vinserti128_high $dst,$dst\t! replicate32B" %}
3200   ins_encode %{
3201     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3202     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3203     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3204     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3205   %}
3206   ins_pipe( pipe_slow );
3207 %}
3208 
// Replicate an immediate byte into all 16 lanes of an XMM register: load a
// precomputed 8-byte replicated pattern from the constant table, then
// duplicate it into the high 64 bits with punpcklqdq.
3209 instruct Repl16B_imm(vecX dst, immI con) %{
3210   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3211   match(Set dst (ReplicateB con));
3212   format %{ "movq    $dst,[$constantaddress]\n\t"
3213             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3214   ins_encode %{
3215     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3216     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3217   %}
3218   ins_pipe( pipe_slow );
3219 %}
3220 
// Replicate an immediate byte into all 32 lanes of a YMM register: load a
// replicated 8-byte constant, widen to 128 bits with punpcklqdq, then copy
// the low half into the high half with vinserti128_high.
// Fix: format comment said "lreplicate32B" — typo for "replicate32B"
// (matches the naming used by every sibling replicate instruct).
3221 instruct Repl32B_imm(vecY dst, immI con) %{
3222   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3223   match(Set dst (ReplicateB con));
3224   format %{ "movq    $dst,[$constantaddress]\n\t"
3225             "punpcklqdq $dst,$dst\n\t"
3226             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3227   ins_encode %{
3228     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3229     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3230     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3231   %}
3232   ins_pipe( pipe_slow );
3233 %}
3234 
// Replicate a 16-bit value from a GPR into all 4 lanes of the low 64 bits:
// movd then pshuflw 0x00 broadcasts word 0 across the low quadword.
3235 instruct Repl4S(vecD dst, rRegI src) %{
3236   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3237   match(Set dst (ReplicateS src));
3238   format %{ "movd    $dst,$src\n\t"
3239             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3240   ins_encode %{
3241     __ movdl($dst$$XMMRegister, $src$$Register);
3242     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3243   %}
3244   ins_pipe( pipe_slow );
3245 %}
3246 
3247 instruct Repl4S_mem(vecD dst, memory mem) %{
3248   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3249   match(Set dst (ReplicateS (LoadS mem)));
3250   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}


3281 %}
3282 
// Replicate an immediate short into all 8 lanes of an XMM register via a
// precomputed replicated constant (element size 2) plus punpcklqdq.
3283 instruct Repl8S_imm(vecX dst, immI con) %{
3284   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3285   match(Set dst (ReplicateS con));
3286   format %{ "movq    $dst,[$constantaddress]\n\t"
3287             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3288   ins_encode %{
3289     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3290     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3291   %}
3292   ins_pipe( pipe_slow );
3293 %}
3294 
// Replicate a short from a GPR into all 16 lanes of a YMM register:
// broadcast into 128 bits, then mirror into the high half.
3295 instruct Repl16S(vecY dst, rRegI src) %{
3296   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3297   match(Set dst (ReplicateS src));
3298   format %{ "movd    $dst,$src\n\t"
3299             "pshuflw $dst,$dst,0x00\n\t"
3300             "punpcklqdq $dst,$dst\n\t"
3301             "vinserti128_high $dst,$dst\t! replicate16S" %}
3302   ins_encode %{
3303     __ movdl($dst$$XMMRegister, $src$$Register);
3304     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3305     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3306     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3307   %}
3308   ins_pipe( pipe_slow );
3309 %}
3310 
// Replicate a short loaded from memory into all 16 lanes of a YMM register.
3311 instruct Repl16S_mem(vecY dst, memory mem) %{
3312   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3313   match(Set dst (ReplicateS (LoadS mem)));
3314   format %{ "pshuflw $dst,$mem,0x00\n\t"
3315             "punpcklqdq $dst,$dst\n\t"
3316             "vinserti128_high $dst,$dst\t! replicate16S" %}
3317   ins_encode %{
3318     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3319     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3320     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3321   %}
3322   ins_pipe( pipe_slow );
3323 %}
3324 
// Replicate an immediate short into all 16 lanes of a YMM register via a
// replicated table constant, punpcklqdq, and a high-half insert.
3325 instruct Repl16S_imm(vecY dst, immI con) %{
3326   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3327   match(Set dst (ReplicateS con));
3328   format %{ "movq    $dst,[$constantaddress]\n\t"
3329             "punpcklqdq $dst,$dst\n\t"
3330             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3331   ins_encode %{
3332     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3333     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3334     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3335   %}
3336   ins_pipe( pipe_slow );
3337 %}
3338 
// Replicate an int from a GPR into all 4 lanes of an XMM register:
// movd then pshufd 0x00 broadcasts dword 0.
3339 instruct Repl4I(vecX dst, rRegI src) %{
3340   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3341   match(Set dst (ReplicateI src));
3342   format %{ "movd    $dst,$src\n\t"
3343             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3344   ins_encode %{
3345     __ movdl($dst$$XMMRegister, $src$$Register);
3346     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3347   %}
3348   ins_pipe( pipe_slow );
3349 %}
3350 
// Replicate an int loaded from memory into all 4 lanes of an XMM register
// with a single pshufd from the memory operand (AVX only).
3351 instruct Repl4I_mem(vecX dst, memory mem) %{
3352   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3353   match(Set dst (ReplicateI (LoadI mem)));
3354   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3355   ins_encode %{
3356     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3357   %}
3358   ins_pipe( pipe_slow );
3359 %}
3360 
// Replicate an int from a GPR across a 256-bit vector, 8 lanes:
// build the low 128 bits, then duplicate into the high 128-bit half.
3361 instruct Repl8I(vecY dst, rRegI src) %{
3362   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3363   match(Set dst (ReplicateI src));
3364   format %{ "movd    $dst,$src\n\t"
3365             "pshufd  $dst,$dst,0x00\n\t"
3366             "vinserti128_high $dst,$dst\t! replicate8I" %}
3367   ins_encode %{
3368     __ movdl($dst$$XMMRegister, $src$$Register);
3369     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3370     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3371   %}
3372   ins_pipe( pipe_slow );
3373 %}
3374 
// Replicate an int loaded from memory across a 256-bit vector, 8 lanes.
3375 instruct Repl8I_mem(vecY dst, memory mem) %{
3376   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3377   match(Set dst (ReplicateI (LoadI mem)));
3378   format %{ "pshufd  $dst,$mem,0x00\n\t"
3379             "vinserti128_high $dst,$dst\t! replicate8I" %}
3380   ins_encode %{
3381     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3382     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3383   %}
3384   ins_pipe( pipe_slow );
3385 %}
3386 
// Replicate an int immediate across 4 lanes via a pre-replicated 8-byte
// constant-table entry duplicated into both 64-bit halves.
3387 instruct Repl4I_imm(vecX dst, immI con) %{
3388   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3389   match(Set dst (ReplicateI con));
3390   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3391             "punpcklqdq $dst,$dst" %}
3392   ins_encode %{
3393     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3394     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3395   %}
3396   ins_pipe( pipe_slow );
3397 %}
3398 
// Replicate an int immediate across a 256-bit vector, 8 lanes.
3399 instruct Repl8I_imm(vecY dst, immI con) %{
3400   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3401   match(Set dst (ReplicateI con));
3402   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3403             "punpcklqdq $dst,$dst\n\t"
3404             "vinserti128_high $dst,$dst" %}
3405   ins_encode %{
3406     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3407     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3408     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3409   %}
3410   ins_pipe( pipe_slow );
3411 %}
3412 
3413 // Long could be loaded into xmm register directly from memory.
// Replicate a long loaded from memory across a 128-bit vector, 2 lanes.
3414 instruct Repl2L_mem(vecX dst, memory mem) %{
3415   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3416   match(Set dst (ReplicateL (LoadL mem)));
3417   format %{ "movq    $dst,$mem\n\t"
3418             "punpcklqdq $dst,$dst\t! replicate2L" %}
3419   ins_encode %{
3420     __ movq($dst$$XMMRegister, $mem$$Address);
3421     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3422   %}
3423   ins_pipe( pipe_slow );
3424 %}
3425 
3426 // Replicate long (8 byte) scalar to be vector
3427 #ifdef _LP64
// 64-bit build: replicate a long from a GPR across a 256-bit vector, 4 lanes
// (movdq moves the full 64-bit register directly into the XMM register).
3428 instruct Repl4L(vecY dst, rRegL src) %{
3429   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3430   match(Set dst (ReplicateL src));
3431   format %{ "movdq   $dst,$src\n\t"
3432             "punpcklqdq $dst,$dst\n\t"
3433             "vinserti128_high $dst,$dst\t! replicate4L" %}
3434   ins_encode %{
3435     __ movdq($dst$$XMMRegister, $src$$Register);
3436     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3437     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3438   %}
3439   ins_pipe( pipe_slow );
3440 %}
3441 #else // _LP64
// 32-bit build: the long lives in a register pair, so assemble it in the XMM
// register from the low and high 32-bit halves before replicating.
3442 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3443   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3444   match(Set dst (ReplicateL src));
3445   effect(TEMP dst, USE src, TEMP tmp);
3446   format %{ "movdl   $dst,$src.lo\n\t"
3447             "movdl   $tmp,$src.hi\n\t"
3448             "punpckldq $dst,$tmp\n\t"
3449             "punpcklqdq $dst,$dst\n\t"
3450             "vinserti128_high $dst,$dst\t! replicate4L" %}
3451   ins_encode %{
3452     __ movdl($dst$$XMMRegister, $src$$Register);
3453     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3454     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3455     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3456     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3457   %}
3458   ins_pipe( pipe_slow );
3459 %}
3460 #endif // _LP64
3461 
// Replicate a long immediate across a 256-bit vector, 4 lanes, from the
// constant table.
3462 instruct Repl4L_imm(vecY dst, immL con) %{
3463   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3464   match(Set dst (ReplicateL con));
3465   format %{ "movq    $dst,[$constantaddress]\n\t"
3466             "punpcklqdq $dst,$dst\n\t"
3467             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3468   ins_encode %{
3469     __ movq($dst$$XMMRegister, $constantaddress($con));
3470     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3471     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3472   %}
3473   ins_pipe( pipe_slow );
3474 %}
3475 
// Replicate a long loaded from memory across a 256-bit vector, 4 lanes.
3476 instruct Repl4L_mem(vecY dst, memory mem) %{
3477   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3478   match(Set dst (ReplicateL (LoadL mem)));
3479   format %{ "movq    $dst,$mem\n\t"
3480             "punpcklqdq $dst,$dst\n\t"
3481             "vinserti128_high $dst,$dst\t! replicate4L" %}
3482   ins_encode %{
3483     __ movq($dst$$XMMRegister, $mem$$Address);
3484     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3485     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3486   %}
3487   ins_pipe( pipe_slow );
3488 %}
3489 
// Replicate a float loaded from memory into the low 2 lanes of a 64-bit
// (vecD) vector with one pshufd from the memory operand (AVX required).
3490 instruct Repl2F_mem(vecD dst, memory mem) %{
3491   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3492   match(Set dst (ReplicateF (LoadF mem)));
3493   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3494   ins_encode %{
3495     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3496   %}
3497   ins_pipe( pipe_slow );
3498 %}
3499 
// Replicate a float loaded from memory across a 128-bit vector, 4 lanes.
3500 instruct Repl4F_mem(vecX dst, memory mem) %{
3501   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3502   match(Set dst (ReplicateF (LoadF mem)));
3503   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3504   ins_encode %{
3505     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
// Replicate a float from an XMM register across a 256-bit vector, 8 lanes.
3510 instruct Repl8F(vecY dst, regF src) %{
3511   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3512   match(Set dst (ReplicateF src));
3513   format %{ "pshufd  $dst,$src,0x00\n\t"
3514             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3515   ins_encode %{
3516     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3517     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
// Replicate a float loaded from memory across a 256-bit vector, 8 lanes.
3522 instruct Repl8F_mem(vecY dst, memory mem) %{
3523   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3524   match(Set dst (ReplicateF (LoadF mem)));
3525   format %{ "pshufd  $dst,$mem,0x00\n\t"
3526             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3527   ins_encode %{
3528     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3529     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3530   %}
3531   ins_pipe( pipe_slow );
3532 %}
3533 
// Zero a 2-float vector with the self-xor idiom (pre-AVX-512 path).
3534 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3535   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3536   match(Set dst (ReplicateF zero));
3537   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3538   ins_encode %{
3539     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3540   %}
3541   ins_pipe( fpu_reg_reg );
3542 %}
3543 
3544 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3545   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3546   match(Set dst (ReplicateF zero));
3547   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3548   ins_encode %{
3549     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);


3559     int vector_len = 1;
3560     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3561   %}
3562   ins_pipe( fpu_reg_reg );
3563 %}
3564 
// Replicate a double loaded from memory across a 128-bit vector, 2 lanes;
// pshufd imm 0x44 duplicates the low 64 bits into both halves.
3565 instruct Repl2D_mem(vecX dst, memory mem) %{
3566   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3567   match(Set dst (ReplicateD (LoadD mem)));
3568   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3569   ins_encode %{
3570     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3571   %}
3572   ins_pipe( pipe_slow );
3573 %}
3574 
// Replicate a double from an XMM register across a 256-bit vector, 4 lanes.
3575 instruct Repl4D(vecY dst, regD src) %{
3576   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3577   match(Set dst (ReplicateD src));
3578   format %{ "pshufd  $dst,$src,0x44\n\t"
3579             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3580   ins_encode %{
3581     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3582     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3583   %}
3584   ins_pipe( pipe_slow );
3585 %}
3586 
// Replicate a double loaded from memory across a 256-bit vector, 4 lanes.
3587 instruct Repl4D_mem(vecY dst, memory mem) %{
3588   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3589   match(Set dst (ReplicateD (LoadD mem)));
3590   format %{ "pshufd  $dst,$mem,0x44\n\t"
3591             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3592   ins_encode %{
3593     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3594     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3595   %}
3596   ins_pipe( pipe_slow );
3597 %}
3598 
3599 // Replicate double (8 byte) scalar zero to be vector
// Zero a 2-double vector with the self-xor idiom (pre-AVX-512 path).
3600 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3601   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3602   match(Set dst (ReplicateD zero));
3603   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3604   ins_encode %{
3605     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3606   %}
3607   ins_pipe( fpu_reg_reg );
3608 %}
3609 
3610 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3611   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3612   match(Set dst (ReplicateD zero));
3613   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3614   ins_encode %{


4774             "movd    $dst,$tmp2\t! add reduction4I" %}
4775   ins_encode %{
4776     int vector_len = 0;
4777     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4778     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4779     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4780     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4781     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4782     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4783     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4784   %}
4785   ins_pipe( pipe_slow );
4786 %}
4787 
// Add-reduce 8 ints (256-bit src2) plus scalar src1 into dst, using the
// AVX1/AVX2-only horizontal-add path (predicate: supports_avxonly).
4788 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4789   predicate(VM_Version::supports_avxonly());
4790   match(Set dst (AddReductionVI src1 src2));
4791   effect(TEMP tmp, TEMP tmp2);
4792   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4793             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4794             "vextracti128_high  $tmp2,$tmp\n\t"
4795             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4796             "movd     $tmp2,$src1\n\t"
4797             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4798             "movd     $dst,$tmp2\t! add reduction8I" %}
4799   ins_encode %{
4800     int vector_len = 1;
4801     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
// NOTE(review): $tmp2 is read here before its first write (movdl below);
// presumably only the lanes contributed by $tmp matter for the result — confirm.
4802     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4803     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4804     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4805     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4806     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4807     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4808   %}
4809   ins_pipe( pipe_slow );
4810 %}
4811 
// Add-reduce 8 ints plus scalar src1 into dst on AVX-512 (UseAVX > 2):
// fold the high 128-bit half onto the low half, then shuffle/add down to
// one lane before adding the scalar.
4812 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4813   predicate(UseAVX > 2);
4814   match(Set dst (AddReductionVI src1 src2));
4815   effect(TEMP tmp, TEMP tmp2);
4816   format %{ "vextracti128_high  $tmp,$src2\n\t"
4817             "vpaddd  $tmp,$tmp,$src2\n\t"
4818             "pshufd  $tmp2,$tmp,0xE\n\t"
4819             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4820             "pshufd  $tmp2,$tmp,0x1\n\t"
4821             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4822             "movd    $tmp2,$src1\n\t"
4823             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4824             "movd    $dst,$tmp2\t! add reduction8I" %}
4825   ins_encode %{
4826     int vector_len = 0;
4827     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4828     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4829     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4830     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4831     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4832     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4833     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4834     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4835     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4836   %}
4837   ins_pipe( pipe_slow );
4838 %}
4839 
// Add-reduce 16 ints (512-bit src2) plus scalar src1 into dst on AVX-512:
// fold 512->256 (vextracti64x4_high), then 256->128, then shuffle/add down
// to one lane before adding the scalar src1.
// Fix: the format's trailing tag said "mul reduction16I"; this instruct
// matches AddReductionVI, so the disassembly comment must read "add".
4840 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4841   predicate(UseAVX > 2);
4842   match(Set dst (AddReductionVI src1 src2));
4843   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4844   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4845             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4846             "vextracti128_high  $tmp,$tmp3\n\t"
4847             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4848             "pshufd  $tmp2,$tmp,0xE\n\t"
4849             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4850             "pshufd  $tmp2,$tmp,0x1\n\t"
4851             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4852             "movd    $tmp2,$src1\n\t"
4853             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4854             "movd    $dst,$tmp2\t! add reduction16I" %}
4855   ins_encode %{
4856     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4857     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4858     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4859     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4860     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4861     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4862     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4863     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4864     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4865     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4866     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4867   %}
4868   ins_pipe( pipe_slow );
4869 %}
4870 
4871 #ifdef _LP64
// Add-reduce 2 longs (128-bit src2) plus scalar src1 into dst (AVX-512 path).
4872 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4873   predicate(UseAVX > 2);
4874   match(Set dst (AddReductionVL src1 src2));
4875   effect(TEMP tmp, TEMP tmp2);
4876   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4877             "vpaddq  $tmp,$src2,$tmp2\n\t"
4878             "movdq   $tmp2,$src1\n\t"
4879             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4880             "movdq   $dst,$tmp2\t! add reduction2L" %}
4881   ins_encode %{
4882     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4883     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4884     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4885     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4886     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4887   %}
4888   ins_pipe( pipe_slow );
4889 %}
4890 
// Add-reduce 4 longs (256-bit src2) plus scalar src1 into dst: fold the high
// 128-bit half onto the low half, then the high lane onto the low lane.
4891 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4892   predicate(UseAVX > 2);
4893   match(Set dst (AddReductionVL src1 src2));
4894   effect(TEMP tmp, TEMP tmp2);
4895   format %{ "vextracti128_high  $tmp,$src2\n\t"
4896             "vpaddq  $tmp2,$tmp,$src2\n\t"
4897             "pshufd  $tmp,$tmp2,0xE\n\t"
4898             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4899             "movdq   $tmp,$src1\n\t"
4900             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4901             "movdq   $dst,$tmp2\t! add reduction4L" %}
4902   ins_encode %{
4903     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4904     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4905     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4906     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4907     __ movdq($tmp$$XMMRegister, $src1$$Register);
4908     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4909     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4910   %}
4911   ins_pipe( pipe_slow );
4912 %}
4913 
// Add-reduce 8 longs (512-bit src2) plus scalar src1 into dst: fold 512->256,
// then 256->128, then the high lane onto the low, then add the scalar.
4914 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4915   predicate(UseAVX > 2);
4916   match(Set dst (AddReductionVL src1 src2));
4917   effect(TEMP tmp, TEMP tmp2);
4918   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
4919             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4920             "vextracti128_high  $tmp,$tmp2\n\t"
4921             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4922             "pshufd  $tmp,$tmp2,0xE\n\t"
4923             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4924             "movdq   $tmp,$src1\n\t"
4925             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4926             "movdq   $dst,$tmp2\t! add reduction8L" %}
4927   ins_encode %{
4928     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4929     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4930     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4931     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4932     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4933     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4934     __ movdq($tmp$$XMMRegister, $src1$$Register);
4935     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4936     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4937   %}
4938   ins_pipe( pipe_slow );
4939 %}
4940 #endif
4941 
4942 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4943   predicate(UseSSE >= 1 && UseAVX == 0);
4944   match(Set dst (AddReductionVF dst src2));
4945   effect(TEMP dst, TEMP tmp);
4946   format %{ "addss   $dst,$src2\n\t"
4947             "pshufd  $tmp,$src2,0x01\n\t"
4948             "addss   $dst,$tmp\t! add reduction2F" %}
4949   ins_encode %{
4950     __ addss($dst$$XMMRegister, $src2$$XMMRegister);


5009     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5010     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5011     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5012     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5013     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5014   %}
5015   ins_pipe( pipe_slow );
5016 %}
5017 
// Add-reduce 8 floats into scalar dst (dst is both input accumulator and
// output): add the 4 low lanes one-by-one via pshufd shuffles, extract the
// high 128-bit half, and add its 4 lanes the same way. Sequential scalar
// adds preserve strict FP ordering.
5018 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5019   predicate(UseAVX > 0);
5020   match(Set dst (AddReductionVF dst src2));
5021   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5022   format %{ "vaddss  $dst,$dst,$src2\n\t"
5023             "pshufd  $tmp,$src2,0x01\n\t"
5024             "vaddss  $dst,$dst,$tmp\n\t"
5025             "pshufd  $tmp,$src2,0x02\n\t"
5026             "vaddss  $dst,$dst,$tmp\n\t"
5027             "pshufd  $tmp,$src2,0x03\n\t"
5028             "vaddss  $dst,$dst,$tmp\n\t"
5029             "vextractf128_high  $tmp2,$src2\n\t"
5030             "vaddss  $dst,$dst,$tmp2\n\t"
5031             "pshufd  $tmp,$tmp2,0x01\n\t"
5032             "vaddss  $dst,$dst,$tmp\n\t"
5033             "pshufd  $tmp,$tmp2,0x02\n\t"
5034             "vaddss  $dst,$dst,$tmp\n\t"
5035             "pshufd  $tmp,$tmp2,0x03\n\t"
5036             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5037   ins_encode %{
5038     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5039     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5040     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5041     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5042     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5043     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5044     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5045     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5046     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5047     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5048     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5049     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5050     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5051     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5052     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5053   %}
5054   ins_pipe( pipe_slow );
5055 %}
5056 
// Add-reduce 16 floats (512-bit src2) into scalar dst: process each of the
// four 128-bit quarters (vextractf32x4 imm 0x1..0x3 for the upper three),
// adding its 4 lanes one-by-one. Sequential scalar adds preserve strict
// FP ordering.
5057 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5058   predicate(UseAVX > 2);
5059   match(Set dst (AddReductionVF dst src2));
5060   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5061   format %{ "vaddss  $dst,$dst,$src2\n\t"
5062             "pshufd  $tmp,$src2,0x01\n\t"
5063             "vaddss  $dst,$dst,$tmp\n\t"
5064             "pshufd  $tmp,$src2,0x02\n\t"
5065             "vaddss  $dst,$dst,$tmp\n\t"
5066             "pshufd  $tmp,$src2,0x03\n\t"
5067             "vaddss  $dst,$dst,$tmp\n\t"
5068             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5069             "vaddss  $dst,$dst,$tmp2\n\t"
5070             "pshufd  $tmp,$tmp2,0x01\n\t"
5071             "vaddss  $dst,$dst,$tmp\n\t"
5072             "pshufd  $tmp,$tmp2,0x02\n\t"
5073             "vaddss  $dst,$dst,$tmp\n\t"
5074             "pshufd  $tmp,$tmp2,0x03\n\t"
5075             "vaddss  $dst,$dst,$tmp\n\t"
5076             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5077             "vaddss  $dst,$dst,$tmp2\n\t"
5078             "pshufd  $tmp,$tmp2,0x01\n\t"
5079             "vaddss  $dst,$dst,$tmp\n\t"
5080             "pshufd  $tmp,$tmp2,0x02\n\t"
5081             "vaddss  $dst,$dst,$tmp\n\t"
5082             "pshufd  $tmp,$tmp2,0x03\n\t"
5083             "vaddss  $dst,$dst,$tmp\n\t"
5084             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5085             "vaddss  $dst,$dst,$tmp2\n\t"
5086             "pshufd  $tmp,$tmp2,0x01\n\t"
5087             "vaddss  $dst,$dst,$tmp\n\t"
5088             "pshufd  $tmp,$tmp2,0x02\n\t"
5089             "vaddss  $dst,$dst,$tmp\n\t"
5090             "pshufd  $tmp,$tmp2,0x03\n\t"
5091             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5092   ins_encode %{
5093     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5094     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5095     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5096     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5097     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5098     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5099     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5100     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5101     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5102     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5103     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5104     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5105     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5106     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5107     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5108     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5109     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5110     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5111     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5112     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5113     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5114     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5115     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5116     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5117     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5118     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5119     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5120     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5121     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5122     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5123     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5124   %}
5125   ins_pipe( pipe_slow );
5126 %}
5127 
5128 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5129   predicate(UseSSE >= 1 && UseAVX == 0);
5130   match(Set dst (AddReductionVD dst src2));
5131   effect(TEMP tmp, TEMP dst);
5132   format %{ "addsd   $dst,$src2\n\t"
5133             "pshufd  $tmp,$src2,0xE\n\t"
5134             "addsd   $dst,$tmp\t! add reduction2D" %}
5135   ins_encode %{
5136     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);


5145   match(Set dst (AddReductionVD dst src2));
5146   effect(TEMP tmp, TEMP dst);
5147   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5148             "pshufd  $tmp,$src2,0xE\n\t"
5149             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5150   ins_encode %{
5151     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5152     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5153     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5154   %}
5155   ins_pipe( pipe_slow );
5156 %}
5157 
// Add-reduce 4 doubles (256-bit src2) into scalar dst: add both lanes of the
// low 128 bits, then extract the high 128 bits (vextractf32x4 imm 0x1) and
// add its two lanes. Sequential scalar adds preserve strict FP ordering.
5158 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5159   predicate(UseAVX > 0);
5160   match(Set dst (AddReductionVD dst src2));
5161   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5162   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5163             "pshufd  $tmp,$src2,0xE\n\t"
5164             "vaddsd  $dst,$dst,$tmp\n\t"
5165             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5166             "vaddsd  $dst,$dst,$tmp2\n\t"
5167             "pshufd  $tmp,$tmp2,0xE\n\t"
5168             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5169   ins_encode %{
5170     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5171     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5172     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5173     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5174     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5175     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5176     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5177   %}
5178   ins_pipe( pipe_slow );
5179 %}
5180 
// Add-reduce 8 doubles (512-bit src2) into scalar dst: process each of the
// four 128-bit quarters in order (vextractf32x4 imm 0x1..0x3 for the upper
// three), adding both lanes of each. Sequential scalar adds preserve strict
// FP ordering.
5181 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5182   predicate(UseAVX > 2);
5183   match(Set dst (AddReductionVD dst src2));
5184   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5185   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5186             "pshufd  $tmp,$src2,0xE\n\t"
5187             "vaddsd  $dst,$dst,$tmp\n\t"
5188             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5189             "vaddsd  $dst,$dst,$tmp2\n\t"
5190             "pshufd  $tmp,$tmp2,0xE\n\t"
5191             "vaddsd  $dst,$dst,$tmp\n\t"
5192             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5193             "vaddsd  $dst,$dst,$tmp2\n\t"
5194             "pshufd  $tmp,$tmp2,0xE\n\t"
5195             "vaddsd  $dst,$dst,$tmp\n\t"
5196             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5197             "vaddsd  $dst,$dst,$tmp2\n\t"
5198             "pshufd  $tmp,$tmp2,0xE\n\t"
5199             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5200   ins_encode %{
5201     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5202     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5203     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5204     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5205     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5206     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5207     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5208     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5209     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5210     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5211     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5212     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5213     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5214     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5215     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5216   %}
5217   ins_pipe( pipe_slow );
5218 %}
5219 
5220 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5221   predicate(UseSSE > 3 && UseAVX == 0);
5222   match(Set dst (MulReductionVI src1 src2));
5223   effect(TEMP tmp, TEMP tmp2);
5224   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5225             "pmulld  $tmp2,$src2\n\t"
5226             "movd    $tmp,$src1\n\t"
5227             "pmulld  $tmp2,$tmp\n\t"
5228             "movd    $dst,$tmp2\t! mul reduction2I" %}
5229   ins_encode %{
5230     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5231     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5232     __ movdl($tmp$$XMMRegister, $src1$$Register);


5290             "movd     $tmp2,$src1\n\t"
5291             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5292             "movd     $dst,$tmp2\t! mul reduction4I" %}
5293   ins_encode %{
5294     int vector_len = 0;
5295     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5296     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5297     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5298     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5299     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5300     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5301     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5302   %}
5303   ins_pipe( pipe_slow );
5304 %}
5305 
// Multiply-reduce 8 ints (256-bit src2) times scalar src1 into dst: fold the
// high 128-bit half onto the low half, then shuffle/multiply down to one
// lane before multiplying in the scalar.
5306 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5307   predicate(UseAVX > 0);
5308   match(Set dst (MulReductionVI src1 src2));
5309   effect(TEMP tmp, TEMP tmp2);
5310   format %{ "vextracti128_high  $tmp,$src2\n\t"
5311             "vpmulld  $tmp,$tmp,$src2\n\t"
5312             "pshufd   $tmp2,$tmp,0xE\n\t"
5313             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5314             "pshufd   $tmp2,$tmp,0x1\n\t"
5315             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5316             "movd     $tmp2,$src1\n\t"
5317             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5318             "movd     $dst,$tmp2\t! mul reduction8I" %}
5319   ins_encode %{
5320     int vector_len = 0;
5321     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5322     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5323     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5324     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5325     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5326     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5327     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5328     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5329     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5330   %}
5331   ins_pipe( pipe_slow );
5332 %}
5333 
// Multiply-reduction of a 16-int (512-bit AVX-512) vector:
//   dst = src1 * src2[0] * src2[1] * ... * src2[15]
// Folds 512 -> 256 -> 128 bits with lane extracts, then halves twice more
// with pshufd shuffles, multiplying pairwise at every step; finally folds
// in the scalar src1 and moves the result to the GP register dst.
// Fix: the format string for the second 128-bit fold printed $src2, but the
// encoding multiplies by $tmp3 — the format now matches the emitted code.
5334 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5335   predicate(UseAVX > 2);
5336   match(Set dst (MulReductionVI src1 src2));
5337   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5338   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5339             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5340             "vextracti128_high  $tmp,$tmp3\n\t"
5341             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5342             "pshufd   $tmp2,$tmp,0xE\n\t"
5343             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5344             "pshufd   $tmp2,$tmp,0x1\n\t"
5345             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5346             "movd     $tmp2,$src1\n\t"
5347             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5348             "movd     $dst,$tmp2\t! mul reduction16I" %}
5349   ins_encode %{
5350     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);             // tmp3 = upper 256 bits of src2
5351     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1); // 256-bit fold (vector_len = 1)
5352     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);               // tmp = upper 128 bits of tmp3
5353     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);   // 128-bit fold
5354     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);                     // bring upper 64 bits down
5355     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5356     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);                     // bring element 1 down
5357     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5358     __ movdl($tmp2$$XMMRegister, $src1$$Register);                             // fold in scalar src1
5359     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5360     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5361   %}
5362   ins_pipe( pipe_slow );
5363 %}
5364 
5365 #ifdef _LP64
// Multiply-reduction of a 2-long (128-bit) vector:
//   dst = src1 * src2[0] * src2[1]
// Requires AVX-512 DQ for the packed 64-bit multiply (vpmullq).
5366 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5367   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5368   match(Set dst (MulReductionVL src1 src2));
5369   effect(TEMP tmp, TEMP tmp2);
5370   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5371             "vpmullq  $tmp,$src2,$tmp2\n\t"
5372             "movdq    $tmp2,$src1\n\t"
5373             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5374             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5375   ins_encode %{
5376     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);                  // tmp2 = upper 64 bits of src2
5377     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0); // tmp = src2[0] * src2[1]
5378     __ movdq($tmp2$$XMMRegister, $src1$$Register);                           // fold in scalar src1
5379     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5380     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5381   %}
5382   ins_pipe( pipe_slow );
5383 %}
5384 
// Multiply-reduction of a 4-long (256-bit) vector:
//   dst = src1 * src2[0] * src2[1] * src2[2] * src2[3]
// Folds the upper 128-bit lane onto the lower, then the upper 64 bits,
// multiplying pairwise; requires AVX-512 DQ for vpmullq.
5385 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5386   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5387   match(Set dst (MulReductionVL src1 src2));
5388   effect(TEMP tmp, TEMP tmp2);
5389   format %{ "vextracti128_high  $tmp,$src2\n\t"
5390             "vpmullq  $tmp2,$tmp,$src2\n\t"
5391             "pshufd   $tmp,$tmp2,0xE\n\t"
5392             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5393             "movdq    $tmp,$src1\n\t"
5394             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5395             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5396   ins_encode %{
5397     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);              // tmp = upper 128 bits of src2
5398     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0); // fold upper lane onto lower
5399     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);                    // bring upper 64 bits down
5400     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5401     __ movdq($tmp$$XMMRegister, $src1$$Register);                             // fold in scalar src1
5402     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5403     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5404   %}
5405   ins_pipe( pipe_slow );
5406 %}
5407 
// Multiply-reduction of an 8-long (512-bit) vector:
//   dst = src1 * src2[0] * ... * src2[7]
// Folds 512 -> 256 -> 128 -> 64 bits, multiplying pairwise at each step,
// then folds in the scalar src1; requires AVX-512 DQ for vpmullq.
5408 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5409   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5410   match(Set dst (MulReductionVL src1 src2));
5411   effect(TEMP tmp, TEMP tmp2);
5412   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5413             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5414             "vextracti128_high  $tmp,$tmp2\n\t"
5415             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5416             "pshufd   $tmp,$tmp2,0xE\n\t"
5417             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5418             "movdq    $tmp,$src1\n\t"
5419             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5420             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5421   ins_encode %{
5422     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);              // tmp2 = upper 256 bits of src2
5423     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);  // 256-bit fold (vector_len = 1)
5424     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);                // tmp = upper 128 bits of tmp2
5425     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);   // 128-bit fold
5426     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);                      // bring upper 64 bits down
5427     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5428     __ movdq($tmp$$XMMRegister, $src1$$Register);                               // fold in scalar src1
5429     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5430     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5431   %}
5432   ins_pipe( pipe_slow );
5433 %}
5434 #endif
5435 
5436 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5437   predicate(UseSSE >= 1 && UseAVX == 0);
5438   match(Set dst (MulReductionVF dst src2));
5439   effect(TEMP dst, TEMP tmp);
5440   format %{ "mulss   $dst,$src2\n\t"
5441             "pshufd  $tmp,$src2,0x01\n\t"
5442             "mulss   $dst,$tmp\t! mul reduction2F" %}
5443   ins_encode %{
5444     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);


5503     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5504     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5505     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5506     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5507     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5508   %}
5509   ins_pipe( pipe_slow );
5510 %}
5511 
// Multiply-reduction of an 8-float (256-bit) vector into the scalar dst:
//   dst = dst * src2[0] * src2[1] * ... * src2[7]
// Each element is shuffled into lane 0 and multiplied into dst one at a
// time with scalar vmulss (strictly sequential, element 0 through 7); the
// upper 128-bit lane is first extracted into tmp2.
5512 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5513   predicate(UseAVX > 0);
5514   match(Set dst (MulReductionVF dst src2));
5515   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5516   format %{ "vmulss  $dst,$dst,$src2\n\t"
5517             "pshufd  $tmp,$src2,0x01\n\t"
5518             "vmulss  $dst,$dst,$tmp\n\t"
5519             "pshufd  $tmp,$src2,0x02\n\t"
5520             "vmulss  $dst,$dst,$tmp\n\t"
5521             "pshufd  $tmp,$src2,0x03\n\t"
5522             "vmulss  $dst,$dst,$tmp\n\t"
5523             "vextractf128_high  $tmp2,$src2\n\t"
5524             "vmulss  $dst,$dst,$tmp2\n\t"
5525             "pshufd  $tmp,$tmp2,0x01\n\t"
5526             "vmulss  $dst,$dst,$tmp\n\t"
5527             "pshufd  $tmp,$tmp2,0x02\n\t"
5528             "vmulss  $dst,$dst,$tmp\n\t"
5529             "pshufd  $tmp,$tmp2,0x03\n\t"
5530             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5531   ins_encode %{
5532     // Elements 0-3 of the lower 128-bit lane.
5533     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5533     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5534     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5535     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5536     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5537     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5538     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5539     // Elements 4-7: extract the upper 128-bit lane, then repeat.
5539     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5540     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5541     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5542     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5543     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5544     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5545     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5546     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5547   %}
5548   ins_pipe( pipe_slow );
5549 %}
5550 
// Multiply-reduction of a 16-float (512-bit) vector into the scalar dst:
//   dst = dst * src2[0] * src2[1] * ... * src2[15]
// Each 128-bit quarter of src2 is extracted in turn (vextractf32x4 with
// immediates 0x1..0x3; the lowest quarter is used directly), and each of
// its four elements is shuffled into lane 0 and multiplied into dst with
// scalar vmulss, strictly in element order.
5551 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5552   predicate(UseAVX > 2);
5553   match(Set dst (MulReductionVF dst src2));
5554   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5555   format %{ "vmulss  $dst,$dst,$src2\n\t"
5556             "pshufd  $tmp,$src2,0x01\n\t"
5557             "vmulss  $dst,$dst,$tmp\n\t"
5558             "pshufd  $tmp,$src2,0x02\n\t"
5559             "vmulss  $dst,$dst,$tmp\n\t"
5560             "pshufd  $tmp,$src2,0x03\n\t"
5561             "vmulss  $dst,$dst,$tmp\n\t"
5562             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5563             "vmulss  $dst,$dst,$tmp2\n\t"
5564             "pshufd  $tmp,$tmp2,0x01\n\t"
5565             "vmulss  $dst,$dst,$tmp\n\t"
5566             "pshufd  $tmp,$tmp2,0x02\n\t"
5567             "vmulss  $dst,$dst,$tmp\n\t"
5568             "pshufd  $tmp,$tmp2,0x03\n\t"
5569             "vmulss  $dst,$dst,$tmp\n\t"
5570             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5571             "vmulss  $dst,$dst,$tmp2\n\t"
5572             "pshufd  $tmp,$tmp2,0x01\n\t"
5573             "vmulss  $dst,$dst,$tmp\n\t"
5574             "pshufd  $tmp,$tmp2,0x02\n\t"
5575             "vmulss  $dst,$dst,$tmp\n\t"
5576             "pshufd  $tmp,$tmp2,0x03\n\t"
5577             "vmulss  $dst,$dst,$tmp\n\t"
5578             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5579             "vmulss  $dst,$dst,$tmp2\n\t"
5580             "pshufd  $tmp,$tmp2,0x01\n\t"
5581             "vmulss  $dst,$dst,$tmp\n\t"
5582             "pshufd  $tmp,$tmp2,0x02\n\t"
5583             "vmulss  $dst,$dst,$tmp\n\t"
5584             "pshufd  $tmp,$tmp2,0x03\n\t"
5585             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5586   ins_encode %{
5587     // Elements 0-3 (lowest 128-bit quarter, used in place).
5587     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5588     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5589     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5590     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5591     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5592     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5593     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5594     // Elements 4-7 (second quarter).
5594     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5595     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5596     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5597     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5598     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5599     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5600     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5601     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5602     // Elements 8-11 (third quarter).
5602     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5603     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5604     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5605     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5606     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5607     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5608     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5609     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5610     // Elements 12-15 (highest quarter).
5610     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5611     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5612     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5613     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5614     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5615     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5616     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5617     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5618   %}
5619   ins_pipe( pipe_slow );
5620 %}
5621 
5622 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5623   predicate(UseSSE >= 1 && UseAVX == 0);
5624   match(Set dst (MulReductionVD dst src2));
5625   effect(TEMP dst, TEMP tmp);
5626   format %{ "mulsd   $dst,$src2\n\t"
5627             "pshufd  $tmp,$src2,0xE\n\t"
5628             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5629   ins_encode %{
5630     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);


5639   match(Set dst (MulReductionVD dst src2));
5640   effect(TEMP tmp, TEMP dst);
5641   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5642             "pshufd  $tmp,$src2,0xE\n\t"
5643             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5644   ins_encode %{
5645     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5646     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5647     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
// Multiply-reduction of a 4-double (256-bit) vector into the scalar dst:
//   dst = dst * src2[0] * src2[1] * src2[2] * src2[3]
// Multiplies the two doubles of the lower 128-bit lane into dst with scalar
// vmulsd, then extracts the upper lane into tmp2 and repeats.
5652 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5653   predicate(UseAVX > 0);
5654   match(Set dst (MulReductionVD dst src2));
5655   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5656   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5657             "pshufd  $tmp,$src2,0xE\n\t"
5658             "vmulsd  $dst,$dst,$tmp\n\t"
5659             "vextractf128_high  $tmp2,$src2\n\t"
5660             "vmulsd  $dst,$dst,$tmp2\n\t"
5661             "pshufd  $tmp,$tmp2,0xE\n\t"
5662             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5663   ins_encode %{
5664     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);  // * element 0
5665     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);                // bring element 1 down
5666     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);   // * element 1
5667     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);         // tmp2 = upper 128 bits
5668     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);  // * element 2
5669     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);                // bring element 3 down
5670     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);   // * element 3
5671   %}
5672   ins_pipe( pipe_slow );
5673 %}
5674 
// Multiply-reduction of an 8-double (512-bit) vector into the scalar dst:
//   dst = dst * src2[0] * src2[1] * ... * src2[7]
// Each 128-bit quarter of src2 is extracted in turn (the lowest quarter is
// used in place); each quarter's two doubles are multiplied into dst with
// scalar vmulsd, strictly in element order.
// Fix: the format string after the 0x1 extract printed "pshufd $tmp,$src2",
// but the encoding shuffles $tmp2 — the format now matches the emitted code.
5675 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5676   predicate(UseAVX > 2);
5677   match(Set dst (MulReductionVD dst src2));
5678   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5679   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5680             "pshufd  $tmp,$src2,0xE\n\t"
5681             "vmulsd  $dst,$dst,$tmp\n\t"
5682             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5683             "vmulsd  $dst,$dst,$tmp2\n\t"
5684             "pshufd  $tmp,$tmp2,0xE\n\t"
5685             "vmulsd  $dst,$dst,$tmp\n\t"
5686             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5687             "vmulsd  $dst,$dst,$tmp2\n\t"
5688             "pshufd  $tmp,$tmp2,0xE\n\t"
5689             "vmulsd  $dst,$dst,$tmp\n\t"
5690             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5691             "vmulsd  $dst,$dst,$tmp2\n\t"
5692             "pshufd  $tmp,$tmp2,0xE\n\t"
5693             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5694   ins_encode %{
5695     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);   // * element 0
5696     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);                 // bring element 1 down
5697     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);    // * element 1
5698     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);         // second quarter
5699     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);   // * element 2
5700     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5701     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);    // * element 3
5702     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);         // third quarter
5703     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);   // * element 4
5704     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5705     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);    // * element 5
5706     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);         // highest quarter
5707     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);   // * element 6
5708     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5709     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);    // * element 7
5710   %}
5711   ins_pipe( pipe_slow );
5712 %}
5713 
5714 // ====================VECTOR ARITHMETIC=======================================
5715 
5716 // --------------------------------- ADD --------------------------------------
5717 
5718 // Bytes vector add
5719 instruct vadd4B(vecS dst, vecS src) %{
5720   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5721   match(Set dst (AddVB dst src));
5722   format %{ "paddb   $dst,$src\t! add packed4B" %}
5723   ins_encode %{
5724     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5725   %}
5726   ins_pipe( pipe_slow );


src/cpu/x86/vm/x86.ad
Index Unified diffs Context diffs Sdiffs Patch New Old Previous File Next File